From 9c33d4d1df484a773c0d3be3764e12e78bc41d7a Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 27 Aug 2018 01:48:46 +0200
Subject: [PATCH] Add more hyper-parameters to spacy ud-train

* subword_features: Controls whether subword features are used in the
  word embeddings. True by default (specifically, prefix, suffix and word
  shape). Should be set to False for languages like Chinese and Japanese.

* conv_depth: Depth of the convolutional layers. Defaults to 4.
---
 spacy/cli/ud_train.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/spacy/cli/ud_train.py b/spacy/cli/ud_train.py
index 68fedbbbf..0514d953d 100644
--- a/spacy/cli/ud_train.py
+++ b/spacy/cli/ud_train.py
@@ -290,7 +290,9 @@ def initialize_pipeline(nlp, docs, golds, config, device):
         for tag in gold.tags:
             if tag is not None:
                 nlp.tagger.add_label(tag)
-    return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds), device=device)
+    return nlp.begin_training(
+        lambda: golds_to_gold_tuples(docs, golds), device=device,
+        subword_features=config.subword_features, conv_depth=config.conv_depth)
 
 
 ########################
@@ -300,10 +302,10 @@ def initialize_pipeline(nlp, docs, golds, config, device):
 class Config(object):
     def __init__(self, vectors=None, max_doc_length=10, multitask_tag=True,
             multitask_sent=True, multitask_dep=True, multitask_vectors=False,
-            nr_epoch=30, batch_size=1000, dropout=0.2):
+            nr_epoch=30, batch_size=1000, dropout=0.2,
+            conv_depth=4, subword_features=True):
         for key, value in locals().items():
             setattr(self, key, value)
-
 
     @classmethod
     def load(cls, loc):
@@ -365,7 +367,7 @@ def main(ud_dir, parses_dir, config, corpus, limit=0, use_gpu=-1, vectors_dir=No
     nlp = load_nlp(paths.lang, config, vectors=vectors_dir)
 
     docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
-                            max_doc_length=None, limit=limit)
+                            max_doc_length=3, limit=limit)
 
     optimizer = initialize_pipeline(nlp, docs, golds, config, use_gpu)
 