Tidy up and auto-format

This commit is contained in:
Ines Montani 2019-10-28 12:43:55 +01:00
parent 92018b9cd4
commit c5e41247e8
9 changed files with 41 additions and 47 deletions

View File

@ -307,6 +307,7 @@ def PyTorchBiLSTM(nO, nI, depth, dropout=0.2):
import torch.nn import torch.nn
from thinc.api import with_square_sequences from thinc.api import with_square_sequences
from thinc.extra.wrappers import PyTorchWrapperRNN from thinc.extra.wrappers import PyTorchWrapperRNN
if depth == 0: if depth == 0:
return layerize(noop()) return layerize(noop())
model = torch.nn.LSTM(nI, nO // 2, depth, bidirectional=True, dropout=dropout) model = torch.nn.LSTM(nI, nO // 2, depth, bidirectional=True, dropout=dropout)
@ -322,7 +323,7 @@ def Tok2Vec(width, embed_size, **kwargs):
bilstm_depth = kwargs.get("bilstm_depth", 0) bilstm_depth = kwargs.get("bilstm_depth", 0)
cols = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"] cols = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
doc2feats_cfg = {"arch": "spacy.Doc2Feats.v1", "config": {"columns": cols}} doc2feats_cfg = {"arch": "spacy.Doc2Feats.v1", "config": {"columns": cols}}
if char_embed: if char_embed:
embed_cfg = { embed_cfg = {
@ -332,13 +333,10 @@ def Tok2Vec(width, embed_size, **kwargs):
"chars": 6, "chars": 6,
"@mix": { "@mix": {
"arch": "spacy.LayerNormalizedMaxout.v1", "arch": "spacy.LayerNormalizedMaxout.v1",
"config": { "config": {"width": width, "pieces": 3},
"width": width,
"pieces": 3
}
}, },
"@embed_features": None "@embed_features": None,
} },
} }
else: else:
embed_cfg = { embed_cfg = {
@ -351,12 +349,9 @@ def Tok2Vec(width, embed_size, **kwargs):
"@pretrained_vectors": None, "@pretrained_vectors": None,
"@mix": { "@mix": {
"arch": "spacy.LayerNormalizedMaxout.v1", "arch": "spacy.LayerNormalizedMaxout.v1",
"config": { "config": {"width": width, "pieces": 3},
"width": width,
"pieces": 3
}
}, },
} },
} }
if pretrained_vectors: if pretrained_vectors:
embed_cfg["config"]["@pretrained_vectors"] = { embed_cfg["config"]["@pretrained_vectors"] = {
@ -364,8 +359,8 @@ def Tok2Vec(width, embed_size, **kwargs):
"config": { "config": {
"vectors_name": pretrained_vectors, "vectors_name": pretrained_vectors,
"width": width, "width": width,
"column": cols.index(ID) "column": cols.index(ID),
} },
} }
cnn_cfg = { cnn_cfg = {
"arch": "spacy.MaxoutWindowEncoder.v1", "arch": "spacy.MaxoutWindowEncoder.v1",
@ -373,35 +368,26 @@ def Tok2Vec(width, embed_size, **kwargs):
"width": width, "width": width,
"window_size": 1, "window_size": 1,
"pieces": cnn_maxout_pieces, "pieces": cnn_maxout_pieces,
"depth": conv_depth "depth": conv_depth,
} },
} }
bilstm_cfg = { bilstm_cfg = {
"arch": "spacy.TorchBiLSTMEncoder.v1", "arch": "spacy.TorchBiLSTMEncoder.v1",
"config": { "config": {"width": width, "depth": bilstm_depth},
"width": width,
"depth": bilstm_depth,
}
} }
if conv_depth == 0 and bilstm_depth == 0: if conv_depth == 0 and bilstm_depth == 0:
encode_cfg = {} encode_cfg = {}
elif conv_depth >= 1 and bilstm_depth >= 1: elif conv_depth >= 1 and bilstm_depth >= 1:
encode_cfg = { encode_cfg = {
"arch": "thinc.FeedForward.v1", "arch": "thinc.FeedForward.v1",
"config": { "config": {"children": [cnn_cfg, bilstm_cfg]},
"children": [cnn_cfg, bilstm_cfg]
}
} }
elif conv_depth >= 1: elif conv_depth >= 1:
encode_cfg = cnn_cfg encode_cfg = cnn_cfg
else: else:
encode_cfg = bilstm_cfg encode_cfg = bilstm_cfg
config = { config = {"@doc2feats": doc2feats_cfg, "@embed": embed_cfg, "@encode": encode_cfg}
"@doc2feats": doc2feats_cfg,
"@embed": embed_cfg,
"@encode": encode_cfg
}
return new_ml.Tok2Vec(config) return new_ml.Tok2Vec(config)

View File

@ -262,8 +262,11 @@ def train(
exits=1, exits=1,
) )
train_docs = corpus.train_docs( train_docs = corpus.train_docs(
nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0, nlp,
ignore_misaligned=True noise_level=noise_level,
gold_preproc=gold_preproc,
max_length=0,
ignore_misaligned=True,
) )
train_labels = set() train_labels = set()
if textcat_multilabel: if textcat_multilabel:
@ -344,7 +347,7 @@ def train(
orth_variant_level=orth_variant_level, orth_variant_level=orth_variant_level,
gold_preproc=gold_preproc, gold_preproc=gold_preproc,
max_length=0, max_length=0,
ignore_misaligned=True ignore_misaligned=True,
) )
if raw_text: if raw_text:
random.shuffle(raw_text) random.shuffle(raw_text)
@ -383,8 +386,11 @@ def train(
if hasattr(component, "cfg"): if hasattr(component, "cfg"):
component.cfg["beam_width"] = beam_width component.cfg["beam_width"] = beam_width
dev_docs = list( dev_docs = list(
corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc, corpus.dev_docs(
ignore_misaligned=True) nlp_loaded,
gold_preproc=gold_preproc,
ignore_misaligned=True,
)
) )
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
start_time = timer() start_time = timer()
@ -401,8 +407,11 @@ def train(
if hasattr(component, "cfg"): if hasattr(component, "cfg"):
component.cfg["beam_width"] = beam_width component.cfg["beam_width"] = beam_width
dev_docs = list( dev_docs = list(
corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc, corpus.dev_docs(
ignore_misaligned=True) nlp_loaded,
gold_preproc=gold_preproc,
ignore_misaligned=True,
)
) )
start_time = timer() start_time = timer()
scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose) scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)

View File

@ -131,9 +131,7 @@ class Language(object):
Defaults = BaseDefaults Defaults = BaseDefaults
lang = None lang = None
factories = { factories = {"tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp)}
"tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp),
}
def __init__( def __init__(
self, vocab=True, make_doc=True, max_length=10 ** 6, meta={}, **kwargs self, vocab=True, make_doc=True, max_length=10 ** 6, meta={}, **kwargs

View File

@ -1 +1,5 @@
from .tok2vec import Tok2Vec # coding: utf8
from __future__ import unicode_literals
from .tok2vec import Tok2Vec # noqa: F401
from .common import FeedForward, LayerNormalizedMaxout # noqa: F401

View File

@ -13,6 +13,7 @@ def FeedForward(config):
model.cfg = config model.cfg = config
return model return model
@register_architecture("spacy.LayerNormalizedMaxout.v1") @register_architecture("spacy.LayerNormalizedMaxout.v1")
def LayerNormalizedMaxout(config): def LayerNormalizedMaxout(config):
width = config["width"] width = config["width"]

View File

@ -9,7 +9,6 @@ from thinc.misc import Residual, LayerNorm, FeatureExtracter
from ..util import make_layer, register_architecture from ..util import make_layer, register_architecture
from ._wire import concatenate_lists from ._wire import concatenate_lists
from .common import *
@register_architecture("spacy.Tok2Vec.v1") @register_architecture("spacy.Tok2Vec.v1")
@ -81,8 +80,7 @@ def MaxoutWindowEncoder(config):
depth = config["depth"] depth = config["depth"]
cnn = chain( cnn = chain(
ExtractWindow(nW=nW), ExtractWindow(nW=nW), LayerNorm(Maxout(nO, nO * ((nW * 2) + 1), pieces=nP))
LayerNorm(Maxout(nO, nO * ((nW * 2) + 1), pieces=nP)),
) )
model = clone(Residual(cnn), depth) model = clone(Residual(cnn), depth)
model.nO = nO model.nO = nO

View File

@ -2,7 +2,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import srsly import srsly
from spacy.gold import GoldCorpus, json_to_tuple from spacy.gold import GoldCorpus
from spacy.lang.en import English from spacy.lang.en import English
from spacy.tests.util import make_tempdir from spacy.tests.util import make_tempdir
@ -94,4 +94,3 @@ json_data = [
], ],
} }
] ]

View File

@ -205,4 +205,3 @@ def test_align(tokens_a, tokens_b, expected):
# check symmetry # check symmetry
cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a) cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a)
assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected

View File

@ -96,14 +96,14 @@ def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
def test_prefer_gpu(): def test_prefer_gpu():
try: try:
import cupy import cupy # noqa: F401
except ImportError: except ImportError:
assert not prefer_gpu() assert not prefer_gpu()
def test_require_gpu(): def test_require_gpu():
try: try:
import cupy import cupy # noqa: F401
except ImportError: except ImportError:
with pytest.raises(ValueError): with pytest.raises(ValueError):
require_gpu() require_gpu()