From 36de9bf72a3e4fbae8940d53e6cecc3ca9515750 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 18 Oct 2019 17:24:13 +0200
Subject: [PATCH] Add more spacy pretrain options

---
 spacy/cli/pretrain.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 13a2d9b36..4a03e3130 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -35,6 +35,8 @@ from .train import _load_pretrained_tok2vec
     output_dir=("Directory to write models to on each epoch", "positional", None, str),
     width=("Width of CNN layers", "option", "cw", int),
     depth=("Depth of CNN layers", "option", "cd", int),
+    use_chars=("Whether to use character-based embedding", "flag", "chr", bool),
+    sa_depth=("Depth of self-attention layers", "option", "sa", int),
     bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int),
     embed_rows=("Number of embedding rows", "option", "er", int),
     loss_func=(
@@ -82,6 +84,8 @@ def pretrain(
     width=96,
     depth=4,
     bilstm_depth=0,
+    sa_depth=0,
+    use_chars=False,
     embed_rows=2000,
     loss_func="cosine",
     use_vectors=False,
@@ -157,9 +161,11 @@ def pretrain(
             embed_rows,
             conv_depth=depth,
             pretrained_vectors=pretrained_vectors,
+            char_embed=use_chars,
+            self_attn_depth=sa_depth,  # Experimental.
             bilstm_depth=bilstm_depth,  # Requires PyTorch. Experimental.
             cnn_maxout_pieces=3,  # You can try setting this higher
-            subword_features=True,  # Set to False for Chinese etc
+            subword_features=not use_chars,  # Set to False for Chinese etc
         ),
     )
    # Load in pretrained weights
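
Usage sketch: assuming plac exposes the new arguments under the abbreviations
declared in the annotations above, the character embedding would be enabled
with -chr and the (experimental) self-attention depth set with -sa. The corpus
file, vectors model, and output directory below are placeholder names, not
values taken from this patch:

    python -m spacy pretrain texts.jsonl en_vectors_web_lg ./pretrain-output -chr -sa 1

Note that passing -chr also flips subword_features off (see the
subword_features=not use_chars change), so the character-based embedding is
used instead of the default subword features rather than in addition to them.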