From d5509e098911605cd0dc974827e18ff54f48812e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 28 Oct 2019 15:16:33 +0100
Subject: [PATCH] Support Mish activation (requires Thinc 7.3) (#4536)

* Add arch for MishWindowEncoder

* Support mish in tok2vec and conv window >=2

* Pass new tok2vec settings from parser

* Syntax error

* Fix tok2vec setting

* Fix registration of MishWindowEncoder

* Fix receptive field setting

* Fix mish arch

* Pass more options from parser

* Support more tok2vec options in pretrain

* Require thinc 7.3

* Add docs [ci skip]

* Require thinc 7.3.0.dev0 to run CI

* Run black

* Fix typo

* Update Thinc version

Co-authored-by: Ines Montani
---
 requirements.txt           |  2 +-
 setup.cfg                  |  4 ++--
 spacy/_ml.py               | 26 ++++++++++++++++----------
 spacy/cli/pretrain.py      | 14 +++++++++++---
 spacy/ml/tok2vec.py        | 19 +++++++++++++++++--
 spacy/syntax/nn_parser.pyx | 12 +++++++++++-
 website/docs/api/cli.md    | 12 +++++++++---
 7 files changed, 67 insertions(+), 22 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 6d76c7233..ad7059f3a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=7.2.0,<7.3.0
+thinc>=7.3.0,<7.4.0
 blis>=0.4.0,<0.5.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.3.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index 2d4a06c2b..51e722354 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -38,14 +38,14 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=7.2.0,<7.3.0
+    thinc>=7.3.0,<7.4.0
 install_requires =
     setuptools
     numpy>=1.15.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=7.2.0,<7.3.0
+    thinc>=7.3.0,<7.4.0
     blis>=0.4.0,<0.5.0
     plac>=0.9.6,<1.2.0
     requests>=2.13.0,<3.0.0
diff --git a/spacy/_ml.py b/spacy/_ml.py
index 6723e5cf9..5d388913b 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -321,6 +321,7 @@ def Tok2Vec(width, embed_size, **kwargs):
     char_embed = kwargs.get("char_embed", False)
     conv_depth = kwargs.get("conv_depth", 4)
     bilstm_depth = kwargs.get("bilstm_depth", 0)
+    conv_window = kwargs.get("conv_window", 1)
 
     cols = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
 
@@ -362,16 +363,21 @@ def Tok2Vec(width, embed_size, **kwargs):
                 "column": cols.index(ID),
             },
         }
-    cnn_cfg = {
-        "arch": "spacy.MaxoutWindowEncoder.v1",
-        "config": {
-            "width": width,
-            "window_size": 1,
-            "pieces": cnn_maxout_pieces,
-            "depth": conv_depth,
-        },
-    }
-
+    if cnn_maxout_pieces >= 2:
+        cnn_cfg = {
+            "arch": "spacy.MaxoutWindowEncoder.v1",
+            "config": {
+                "width": width,
+                "window_size": conv_window,
+                "pieces": cnn_maxout_pieces,
+                "depth": conv_depth,
+            },
+        }
+    else:
+        cnn_cfg = {
+            "arch": "spacy.MishWindowEncoder.v1",
+            "config": {"width": width, "window_size": conv_window, "depth": conv_depth},
+        }
     bilstm_cfg = {
         "arch": "spacy.TorchBiLSTMEncoder.v1",
         "config": {"width": width, "depth": bilstm_depth},
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 9b63b31f0..f7236f7de 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -35,6 +35,10 @@ from .train import _load_pretrained_tok2vec
     output_dir=("Directory to write models to on each epoch", "positional", None, str),
     width=("Width of CNN layers", "option", "cw", int),
     depth=("Depth of CNN layers", "option", "cd", int),
+    cnn_window=("Window size for CNN layers", "option", "cW", int),
+    cnn_pieces=("Maxout size for CNN layers. 1 for Mish", "option", "cP", int),
+    use_chars=("Whether to use character-based embedding", "flag", "chr", bool),
+    sa_depth=("Depth of self-attention layers", "option", "sa", int),
    bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int),
     embed_rows=("Number of embedding rows", "option", "er", int),
     loss_func=(
@@ -81,7 +85,11 @@ def pretrain(
     output_dir,
     width=96,
     depth=4,
-    bilstm_depth=2,
+    bilstm_depth=0,
+    cnn_pieces=3,
+    sa_depth=0,
+    use_chars=False,
+    cnn_window=1,
     embed_rows=2000,
     loss_func="cosine",
     use_vectors=False,
@@ -158,8 +166,8 @@ def pretrain(
             conv_depth=depth,
             pretrained_vectors=pretrained_vectors,
             bilstm_depth=bilstm_depth,  # Requires PyTorch. Experimental.
-            cnn_maxout_pieces=3,  # You can try setting this higher
-            subword_features=True,  # Set to False for Chinese etc
+            subword_features=not use_chars,  # Set to False for Chinese etc
+            cnn_maxout_pieces=cnn_pieces,  # If set to 1, use Mish activation.
         ),
     )
     # Load in pretrained weights
diff --git a/spacy/ml/tok2vec.py b/spacy/ml/tok2vec.py
index d78a45191..0e57cfb73 100644
--- a/spacy/ml/tok2vec.py
+++ b/spacy/ml/tok2vec.py
@@ -16,8 +16,8 @@ def Tok2Vec(config):
     doc2feats = make_layer(config["@doc2feats"])
     embed = make_layer(config["@embed"])
     encode = make_layer(config["@encode"])
-    depth = config["@encode"]["config"]["depth"]
-    tok2vec = chain(doc2feats, with_flatten(chain(embed, encode), pad=depth))
+    field_size = getattr(encode, "receptive_field", 0)
+    tok2vec = chain(doc2feats, with_flatten(chain(embed, encode), pad=field_size))
     tok2vec.cfg = config
     tok2vec.nO = encode.nO
     tok2vec.embed = embed
@@ -84,6 +84,21 @@ def MaxoutWindowEncoder(config):
     )
     model = clone(Residual(cnn), depth)
     model.nO = nO
+    model.receptive_field = nW * depth
+    return model
+
+
+@register_architecture("spacy.MishWindowEncoder.v1")
+def MishWindowEncoder(config):
+    from thinc.v2v import Mish
+
+    nO = config["width"]
+    nW = config["window_size"]
+    depth = config["depth"]
+
+    cnn = chain(ExtractWindow(nW=nW), LayerNorm(Mish(nO, nO * ((nW * 2) + 1))))
+    model = clone(Residual(cnn), depth)
+    model.nO = nO
     return model
 
 
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 364007d10..0ed7e6952 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -57,7 +57,10 @@ cdef class Parser:
         subword_features = util.env_opt('subword_features',
                                         cfg.get('subword_features', True))
         conv_depth = util.env_opt('conv_depth', cfg.get('conv_depth', 4))
+        conv_window = util.env_opt('conv_window', cfg.get('conv_window', 1))
+        t2v_pieces = util.env_opt('cnn_maxout_pieces', cfg.get('cnn_maxout_pieces', 3))
         bilstm_depth = util.env_opt('bilstm_depth', cfg.get('bilstm_depth', 0))
+        self_attn_depth = util.env_opt('self_attn_depth', cfg.get('self_attn_depth', 0))
         if depth != 1:
             raise ValueError(TempErrors.T004.format(value=depth))
         parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
@@ -69,6 +72,8 @@ cdef class Parser:
         pretrained_vectors = cfg.get('pretrained_vectors', None)
         tok2vec = Tok2Vec(token_vector_width, embed_size,
                           conv_depth=conv_depth,
+                          conv_window=conv_window,
+                          cnn_maxout_pieces=t2v_pieces,
                           subword_features=subword_features,
                           pretrained_vectors=pretrained_vectors,
                           bilstm_depth=bilstm_depth)
@@ -90,7 +95,12 @@ cdef class Parser:
             'hidden_width': hidden_width,
             'maxout_pieces': parser_maxout_pieces,
             'pretrained_vectors': pretrained_vectors,
-            'bilstm_depth': bilstm_depth
+            'bilstm_depth': bilstm_depth,
+            'self_attn_depth': self_attn_depth,
+            'conv_depth': conv_depth,
+            'conv_window': conv_window,
+            'embed_size': embed_size,
+            'cnn_maxout_pieces': t2v_pieces
         }
         return ParserModel(tok2vec, lower, upper), cfg
 
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index e41a07374..a37921f3c 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -446,8 +446,10 @@ improvement.
 
 ```bash
 $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir]
-[--width] [--depth] [--embed-rows] [--loss_func] [--dropout] [--batch-size] [--max-length] [--min-length]
-[--seed] [--n-iter] [--use-vectors] [--n-save_every] [--init-tok2vec] [--epoch-start]
+[--width] [--depth] [--cnn-window] [--cnn-pieces] [--use-chars] [--sa-depth]
+[--embed-rows] [--loss_func] [--dropout] [--batch-size] [--max-length]
+[--min-length] [--seed] [--n-iter] [--use-vectors] [--n-save_every]
+[--init-tok2vec] [--epoch-start]
 ```
 
 | Argument | Type | Description |
@@ -457,6 +459,10 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir]
 | `output_dir` | positional | Directory to write models to on each epoch. |
 | `--width`, `-cw` | option | Width of CNN layers. |
 | `--depth`, `-cd` | option | Depth of CNN layers. |
+| `--cnn-window`, `-cW` <Tag variant="new">2.2.2</Tag> | option | Window size for CNN layers. |
+| `--cnn-pieces`, `-cP` <Tag variant="new">2.2.2</Tag> | option | Maxout size for CNN layers. `1` for [Mish](https://github.com/digantamisra98/Mish). |
+| `--use-chars`, `-chr` <Tag variant="new">2.2.2</Tag> | flag | Whether to use character-based embedding. |
+| `--sa-depth`, `-sa` <Tag variant="new">2.2.2</Tag> | option | Depth of self-attention layers. |
 | `--embed-rows`, `-er` | option | Number of embedding rows. |
 | `--loss-func`, `-L` | option | Loss function to use for the objective. Either `"L2"` or `"cosine"`. |
 | `--dropout`, `-d` | option | Dropout rate. |
@@ -469,7 +475,7 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir]
 | `--n-save-every`, `-se` | option | Save model every X batches. |
 | `--init-tok2vec`, `-t2v` <Tag variant="new">2.1</Tag> | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. |
 | `--epoch-start`, `-es` <Tag variant="new">2.1.5</Tag> | option | The epoch to start counting at. Only relevant when using `--init-tok2vec` and the given weight file has been renamed. Prevents unintended overwriting of existing weight files. |
-| **CREATES** | weights | The pretrained weights that can be used to initialize `spacy train`. |
+| **CREATES** | weights | The pretrained weights that can be used to initialize `spacy train`. |
 
 ### JSONL format for raw text {#pretrain-jsonl}
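
For reference, the `Mish` layer this patch pulls in from Thinc 7.3 (`thinc.v2v.Mish`) applies the Mish activation, `mish(x) = x * tanh(softplus(x))`, and the `receptive_field` attribute set on `MaxoutWindowEncoder` encodes that stacking `depth` window encoders of size `window_size` widens the visible context linearly. A minimal NumPy sketch of both ideas, for illustration only (not part of the patch or the Thinc API):

```python
import numpy as np

def mish(x):
    # Mish activation, the function computed by Thinc's Mish layer:
    # mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + e^x))
    return x * np.tanh(np.log1p(np.exp(x)))

def receptive_field(window_size, depth):
    # Mirrors `model.receptive_field = nW * depth` in the diff above:
    # each of the `depth` stacked convolutions adds `window_size`
    # tokens of context on either side of a given token.
    return window_size * depth

print(mish(np.array([-2.0, 0.0, 2.0])))         # approx [-0.2525, 0.0, 1.9440]
print(receptive_field(window_size=1, depth=4))  # default tok2vec: 4 tokens per side
```

The `pad=field_size` change in `spacy/ml/tok2vec.py` uses the same quantity: `with_flatten` pads each sequence by the encoder's receptive field so the window convolutions see well-defined context at sequence boundaries.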