2020-09-28 13:31:54 +03:00
|
|
|
# Filesystem locations that other sections pull in through ${paths.*}
# interpolation.
[paths]
|
|
|
|
# Defaults to null. [corpora.pretrain] uses this value as its `path`, so it
# presumably has to be overridden before pretraining can read any text
# (NOTE(review): confirm how the consumer treats a null path).
raw_text = null
|
|
|
|
|
2020-08-24 16:56:03 +03:00
|
|
|
# Top-level settings for the pretraining run. The batcher, objective and
# optimizer are configured in the [pretraining.*] sub-sections.
[pretraining]
|
|
|
|
# Epoch count setting for pretraining (presumably an upper bound — confirm).
max_epochs = 1000
|
|
|
|
# Dropout rate used during pretraining.
dropout = 0.2
|
|
|
|
# Left unset (null). NOTE(review): presumably controls saving an extra
# intermediate model every N batches, with null disabling it — confirm.
n_save_every = null
|
2020-09-15 02:12:02 +03:00
|
|
|
# Name of the pipeline component targeted by pretraining.
component = "tok2vec"
|
|
|
|
# Empty by default. NOTE(review): presumably an empty value means the whole
# component model is pretrained rather than one named layer — confirm.
layer = ""
|
2020-09-17 12:38:59 +03:00
|
|
|
# Dotted reference to the section of this config that supplies the raw text;
# it matches the [corpora.pretrain] section.
corpus = "corpora.pretrain"
|
2020-09-15 02:12:02 +03:00
|
|
|
|
|
|
|
# How pretraining examples are grouped into batches. The "@batchers" key names
# the registered function; the remaining keys are presumably passed to it as
# arguments (NOTE(review): confirm against the batcher's signature).
[pretraining.batcher]
|
|
|
|
@batchers = "spacy.batch_by_words.v1"
|
|
|
|
# Batch size; given the batcher name this is presumably counted in words.
size = 3000
|
|
|
|
# false here; presumably keeps items that exceed the batch size instead of
# dropping them — confirm.
discard_oversize = false
|
|
|
|
# NOTE(review): presumably the fraction by which a batch may exceed `size`.
tolerance = 0.2
|
|
|
|
# No custom length function configured (null).
get_length = null
|
|
|
|
|
2020-08-24 16:56:03 +03:00
|
|
|
# Selects the pretraining objective.
[pretraining.objective]
|
|
|
|
# Objective variant, chosen by name.
type = "characters"
|
|
|
|
# NOTE(review): presumably the number of characters the "characters"
# objective works with per token — confirm against the objective's docs.
n_characters = 4
|
|
|
|
|
|
|
|
# Optimizer used for pretraining. The "@optimizers" key names the registered
# factory; the remaining keys are presumably its hyperparameters.
[pretraining.optimizer]
|
|
|
|
@optimizers = "Adam.v1"
|
|
|
|
# Adam moment coefficients (beta1 / beta2).
beta1 = 0.9
|
|
|
|
beta2 = 0.999
|
|
|
|
# NOTE(review): presumably makes the L2 term below act as weight decay rather
# than a penalty added to the gradient — confirm against the optimizer docs.
L2_is_weight_decay = true
|
|
|
|
# Regularization strength; how it is applied depends on L2_is_weight_decay.
L2 = 0.01
|
|
|
|
# Gradient clipping setting.
grad_clip = 1.0
|
|
|
|
# NOTE(review): presumably enables tracking of averaged parameters — confirm.
use_averages = true
|
|
|
|
# Small epsilon constant for the optimizer.
eps = 1e-8
|
|
|
|
# Learning rate.
learn_rate = 0.001
|
2020-09-17 12:38:59 +03:00
|
|
|
|
|
|
|
# Parent section for data sources; it has no keys of its own here.
[corpora]
|
|
|
|
|
|
|
|
# Data source for pretraining; [pretraining] refers to it through
# corpus = "corpora.pretrain".
[corpora.pretrain]
|
|
|
|
# Registered reader; going by its name it reads JSONL input.
@readers = "spacy.JsonlReader.v1"
|
2020-09-28 13:05:23 +03:00
|
|
|
# Interpolated from raw_text in the [paths] section (null by default there).
path = ${paths.raw_text}
|
2020-09-17 12:38:59 +03:00
|
|
|
# NOTE(review): presumably length bounds for the texts that are kept; the
# unit is not visible here — confirm.
min_length = 5
|
|
|
|
max_length = 500
|
|
|
|
# NOTE(review): 0 presumably means "no limit" on the number of examples.
limit = 0
|