From dd54511c4fa2554386925c6fe5c861afc1f824f8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 27 Mar 2018 09:39:59 +0000 Subject: [PATCH 1/4] Pass data as a function in begin_training methods --- spacy/language.py | 2 +- spacy/pipeline.pyx | 22 +++++++++++----------- spacy/syntax/nn_parser.pyx | 8 ++++---- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 4e74327a3..33225da48 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -463,7 +463,7 @@ class Language(object): self._optimizer = sgd for name, proc in self.pipeline: if hasattr(proc, 'begin_training'): - proc.begin_training(get_gold_tuples(), + proc.begin_training(get_gold_tuples, pipeline=self.pipeline, sgd=self._optimizer, **cfg) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index bcf42b724..01a2b16e4 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -184,7 +184,7 @@ class Pipe(object): return create_default_optimizer(self.model.ops, **self.cfg.get('optimizer', {})) - def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None, + def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): """Initialize the pipe for training, using data exampes if available. If no model has been initialized yet, the model is added.""" @@ -386,7 +386,7 @@ class Tensorizer(Pipe): loss = (d_scores**2).sum() return loss, d_scores - def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None, + def begin_training(self, gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): """Allocate models, pre-process training data and acquire an optimizer. @@ -510,11 +510,11 @@ class Tagger(Pipe): d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) return float(loss), d_scores - def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None, + def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): orig_tag_map = dict(self.vocab.morphology.tag_map) new_tag_map = OrderedDict() - for raw_text, annots_brackets in gold_tuples: + for raw_text, annots_brackets in get_gold_tuples(): for annots, brackets in annots_brackets: ids, words, tags, heads, deps, ents = annots for tag in tags: @@ -687,9 +687,9 @@ class MultitaskObjective(Tagger): def set_annotations(self, docs, dep_ids, tensors=None): pass - def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None, + def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, tok2vec=None, sgd=None, **kwargs): - gold_tuples = nonproj.preprocess_training_data(gold_tuples) + gold_tuples = nonproj.preprocess_training_data(get_gold_tuples()) for raw_text, annots_brackets in gold_tuples: for annots, brackets in annots_brackets: ids, words, tags, heads, deps, ents = annots @@ -962,7 +962,7 @@ class TextCategorizer(Pipe): self.labels.append(label) return 1 - def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None): + def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None): if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer': token_vector_width = pipeline[0].model.nO else: @@ -990,10 +990,10 @@ cdef class DependencyParser(Parser): labeller = MultitaskObjective(self.vocab, target=target) self._multitasks.append(labeller) - def init_multitask_objectives(self, gold_tuples, pipeline, sgd=None, **cfg): + def init_multitask_objectives(self, get_gold_tuples, pipeline, sgd=None, **cfg): for labeller in self._multitasks: tok2vec = self.model[0] - labeller.begin_training(gold_tuples, pipeline=pipeline, + labeller.begin_training(get_gold_tuples, pipeline=pipeline, tok2vec=tok2vec, sgd=sgd) def __reduce__(self): @@ -1011,10 +1011,10 @@ cdef class EntityRecognizer(Parser): labeller = MultitaskObjective(self.vocab, target=target) self._multitasks.append(labeller) - def init_multitask_objectives(self, gold_tuples, pipeline, sgd=None, **cfg): + def init_multitask_objectives(self, get_gold_tuples, pipeline, sgd=None, **cfg): for labeller in self._multitasks: tok2vec = self.model[0] - labeller.begin_training(gold_tuples, pipeline=pipeline, + labeller.begin_training(get_gold_tuples, pipeline=pipeline, tok2vec=tok2vec) def __reduce__(self): diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 759ccacab..af38b2c2a 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -833,11 +833,11 @@ cdef class Parser: copy_array(larger.b[:smaller.nO], smaller.b) self.model[-1]._layers[-1] = larger - def begin_training(self, gold_tuples, pipeline=None, sgd=None, **cfg): + def begin_training(self, get_gold_tuples, pipeline=None, sgd=None, **cfg): if 'model' in cfg: self.model = cfg['model'] cfg.setdefault('min_action_freq', 30) - actions = self.moves.get_actions(gold_parses=gold_tuples, + actions = self.moves.get_actions(gold_parses=get_gold_tuples(), min_freq=cfg.get('min_action_freq', 30)) self.moves.initialize_actions(actions) cfg.setdefault('token_vector_width', 128) @@ -849,7 +849,7 @@ cdef class Parser: self.model[1].begin_training( self.model[1].ops.allocate((5, cfg['token_vector_width']))) if pipeline is not None: - self.init_multitask_objectives(gold_tuples, pipeline, sgd=sgd, **cfg) + self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg) link_vectors_to_models(self.vocab) else: if sgd is None: @@ -863,7 +863,7 @@ cdef class Parser: # Defined in subclasses, to avoid circular import raise NotImplementedError - def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): + def init_multitask_objectives(self, get_gold_tuples, pipeline, **cfg): '''Setup models for secondary objectives, to benefit from multi-task learning. This method is intended to be overridden by subclasses. From 8bbd26579cad81a8a347ed8c097e389110569312 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 27 Mar 2018 09:53:35 +0000 Subject: [PATCH 2/4] Support GPU in UD training script --- spacy/cli/ud_train.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/spacy/cli/ud_train.py b/spacy/cli/ud_train.py index b827d4a4f..853cff9b3 100644 --- a/spacy/cli/ud_train.py +++ b/spacy/cli/ud_train.py @@ -254,7 +254,7 @@ def load_nlp(corpus, config): nlp.vocab.from_disk(Path(config.vectors) / 'vocab') return nlp -def initialize_pipeline(nlp, docs, golds, config): +def initialize_pipeline(nlp, docs, golds, config, device): nlp.add_pipe(nlp.create_pipe('parser')) if config.multitask_tag: nlp.parser.add_multitask_objective('tag') @@ -265,7 +265,7 @@ def initialize_pipeline(nlp, docs, golds, config): for tag in gold.tags: if tag is not None: nlp.tagger.add_label(tag) - return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds)) + return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds), device=device) ######################## @@ -318,15 +318,14 @@ class TreebankPaths(object): "positional", None, str), parses_dir=("Directory to write the development parses", "positional", None, Path), config=("Path to json formatted config file", "positional"), - limit=("Size limit", "option", "n", int) + limit=("Size limit", "option", "n", int), + use_gpu=("Use GPU", "option", "g", int) ) -def main(ud_dir, parses_dir, config, corpus, limit=0): +def main(ud_dir, parses_dir, config, corpus, limit=0, use_gpu=-1): + spacy.util.fix_random_seed() lang.zh.Chinese.Defaults.use_jieba = False lang.ja.Japanese.Defaults.use_janome = False - random.seed(0) - numpy.random.seed(0) - config = Config.load(config) paths = TreebankPaths(ud_dir, corpus) if not (parses_dir / corpus).exists(): @@ -337,9 +336,9 @@ def main(ud_dir, parses_dir, config, corpus, limit=0): docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(), max_doc_length=config.max_doc_length, limit=limit) - optimizer = initialize_pipeline(nlp, docs, golds, config) + optimizer = initialize_pipeline(nlp, docs, golds, config, use_gpu) - batch_sizes = compounding(config.batch_size //10, config.batch_size, 1.001) + batch_sizes = compounding(config.batch_size//10, config.batch_size, 1.001) for i in range(config.nr_epoch): docs = [nlp.make_doc(doc.text) for doc in docs] Xs = list(zip(docs, golds)) From 987e1533a4907e35468c5cc55ca4b80131bf5036 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 27 Mar 2018 10:08:12 +0000 Subject: [PATCH 3/4] Use 8 features in parser --- spacy/syntax/nn_parser.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index af38b2c2a..d772be40b 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -736,7 +736,7 @@ cdef class Parser: lower, stream, drop=0.0) return (tokvecs, bp_tokvecs), state2vec, upper - nr_feature = 13 + nr_feature = 8 def get_token_ids(self, states): cdef StateClass state From 25280b7013a041a003f7eefe9c51a3f216d7c33d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 27 Mar 2018 10:08:38 +0000 Subject: [PATCH 4/4] Try to make sum_state_features faster --- spacy/syntax/nn_parser.pyx | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index d772be40b..8fd870939 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -165,16 +165,17 @@ cdef void sum_state_features(float* output, cdef const float* feature padding = cached cached += F * O + cdef int id_stride = F*O + cdef float one = 1. for b in range(B): for f in range(F): if token_ids[f] < 0: feature = &padding[f*O] else: - idx = token_ids[f] * F * O + f*O + idx = token_ids[f] * id_stride + f*O feature = &cached[idx] - VecVec.add_i(output, - feature, 1., O) - output += O + openblas.simple_axpy(&output[b*O], O, + feature, one) token_ids += F