From 0703f5986bb2347f3332cdc3851246c1b19ca4ba Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 15 Mar 2019 00:48:39 +0100
Subject: [PATCH 01/11] Remove hack from beam

---
 spacy/syntax/_beam_utils.pyx | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx
index f06d54d9d..83137a4c4 100644
--- a/spacy/syntax/_beam_utils.pyx
+++ b/spacy/syntax/_beam_utils.pyx
@@ -209,10 +209,6 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
     # Track the "maximum violation", to use in the update.
     for i, violn in enumerate(violns):
         violn.check_crf(pbeam[i], gbeam[i])
-        # Use 'early update' if best gold is way out of contention.
-        if pbeam[i].loss > 0 and pbeam[i].min_score > (gbeam[i].score * 5.00):
-            pbeam.dones[i] = True
-            gbeam.dones[i] = True
     histories = []
     losses = []
     for violn in violns:
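
What is removed here is an "early update" heuristic: when the predicted beam
already had a loss and even its lowest-scoring candidate (`min_score`) was more
than five times the best gold beam's score, both beams were marked done so the
update happened on the truncated prefix. With the hack gone, the update relies
on the "maximum violation" bookkeeping alone: `violn.check_crf(pbeam[i],
gbeam[i])` records the step at which the prediction most outscores the gold
path, and the loss is computed there. A rough plain-Python sketch of that
bookkeeping, with hypothetical attribute names (the real class lives in thinc):

    class MaxViolation(object):
        # Track the step where the prediction most outscores gold (a sketch,
        # not thinc's implementation).
        def __init__(self):
            self.delta = -1.0
            self.pred_hist = []
            self.gold_hist = []

        def check_crf(self, pred_beam, gold_beam):
            # The violation is the margin by which the predicted beam's best
            # candidate beats the gold beam's best candidate.
            delta = pred_beam.score - gold_beam.score
            if delta > self.delta:
                self.delta = delta
                self.pred_hist = list(pred_beam.histories)
                self.gold_hist = list(gold_beam.histories)

The update is then taken from the histories recorded at the worst-violation
step, rather than from wherever the search happened to stop.
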
From f762c36e618ea8c97fbe4fea0838f04b6beaf7da Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 15 Mar 2019 15:18:28 +0100
Subject: [PATCH 02/11] Evaluate accuracy at multiple beam widths

---
 spacy/cli/train.py | 99 +++++++++++++++++++++++++---------------------
 1 file changed, 55 insertions(+), 44 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index c6ada957f..3411bdc68 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -200,9 +200,9 @@ def train(
         msg.text("Loaded pretrained tok2vec for: {}".format(components))
 
     # fmt: off
-    row_head = ("Itn", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F", "Tag %", "Token %", "CPU WPS", "GPU WPS")
+    row_head = ("Itn", "Beam Width", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F", "Tag %", "Token %", "CPU WPS", "GPU WPS")
     row_settings = {
-        "widths": (3, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7),
+        "widths": (3, 10, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7),
         "aligns": tuple(["r" for i in row_head]),
         "spacing": 2
     }
@@ -247,51 +247,61 @@ def train(
             epoch_model_path = output_path / ("model%d" % i)
             nlp.to_disk(epoch_model_path)
             nlp_loaded = util.load_model_from_path(epoch_model_path)
-            dev_docs = list(corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc))
-            nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
-            start_time = timer()
-            scorer = nlp_loaded.evaluate(dev_docs, debug)
-            end_time = timer()
-            if use_gpu < 0:
-                gpu_wps = None
-                cpu_wps = nwords / (end_time - start_time)
-            else:
-                gpu_wps = nwords / (end_time - start_time)
-                with Model.use_device("cpu"):
-                    nlp_loaded = util.load_model_from_path(epoch_model_path)
-                    dev_docs = list(
-                        corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
-                    )
-                    start_time = timer()
-                    scorer = nlp_loaded.evaluate(dev_docs)
-                    end_time = timer()
-                    cpu_wps = nwords / (end_time - start_time)
-            acc_loc = output_path / ("model%d" % i) / "accuracy.json"
-            srsly.write_json(acc_loc, scorer.scores)
+            for beam_width in [1, 4, 16, 128]:
+                for name, component in nlp_loaded.pipeline:
+                    if hasattr(component, "cfg"):
+                        component.cfg["beam_width"] = beam_width
+                dev_docs = list(corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc))
+                nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
+                start_time = timer()
+                scorer = nlp_loaded.evaluate(dev_docs, debug)
+                end_time = timer()
+                if use_gpu < 0:
+                    gpu_wps = None
+                    cpu_wps = nwords / (end_time - start_time)
+                else:
+                    gpu_wps = nwords / (end_time - start_time)
+                    with Model.use_device("cpu"):
+                        nlp_loaded = util.load_model_from_path(epoch_model_path)
+                        nlp_loaded.parser.cfg["beam_width"]
+                        dev_docs = list(
+                            corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
+                        )
+                        start_time = timer()
+                        scorer = nlp_loaded.evaluate(dev_docs)
+                        end_time = timer()
+                        cpu_wps = nwords / (end_time - start_time)
+                acc_loc = output_path / ("model%d" % i) / "accuracy.json"
+                srsly.write_json(acc_loc, scorer.scores)
 
-            # Update model meta.json
-            meta["lang"] = nlp.lang
-            meta["pipeline"] = nlp.pipe_names
-            meta["spacy_version"] = ">=%s" % about.__version__
-            meta["accuracy"] = scorer.scores
-            meta["speed"] = {"nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps}
-            meta["vectors"] = {
-                "width": nlp.vocab.vectors_length,
-                "vectors": len(nlp.vocab.vectors),
-                "keys": nlp.vocab.vectors.n_keys,
-                "name": nlp.vocab.vectors.name
-            }
-            meta.setdefault("name", "model%d" % i)
-            meta.setdefault("version", version)
-            meta_loc = output_path / ("model%d" % i) / "meta.json"
-            srsly.write_json(meta_loc, meta)
-
-            util.set_env_log(verbose)
-
-            progress = _get_progress(
-                i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps
-            )
-            msg.row(progress, **row_settings)
+                # Update model meta.json
+                meta["lang"] = nlp.lang
+                meta["pipeline"] = nlp.pipe_names
+                meta["spacy_version"] = ">=%s" % about.__version__
+                if beam_width == 1:
+                    meta["speed"] = {"nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps}
+                    meta["accuracy"] = scorer.scores
+                else:
+                    meta.setdefault("beam_accuracy", {})
+                    meta.setdefault("beam_speed", {})
+                    meta["beam_accuracy"][beam_width] = scorer.scores
+                    meta["beam_speed"][beam_width] = {"nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps}
+                meta["vectors"] = {
+                    "width": nlp.vocab.vectors_length,
+                    "vectors": len(nlp.vocab.vectors),
+                    "keys": nlp.vocab.vectors.n_keys,
+                    "name": nlp.vocab.vectors.name
+                }
+                meta.setdefault("name", "model%d" % i)
+                meta.setdefault("version", version)
+                meta_loc = output_path / ("model%d" % i) / "meta.json"
+                srsly.write_json(meta_loc, meta)
+                util.set_env_log(verbose)
+
+                progress = _get_progress(
+                    i, beam_width, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps
+                )
+                msg.row(progress, **row_settings)
     finally:
         with nlp.use_params(optimizer.averages):
             final_model_path = output_path / "model-final"
@@ -377,7 +387,7 @@ def _get_metrics(component):
     return ("token_acc",)
 
 
-def _get_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
+def _get_progress(itn, beam_width, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
     scores = {}
     for col in [
         "dep_loss",
@@ -400,6 +410,7 @@ def _get_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
     scores["gpu_wps"] = gpu_wps or 0.0
     return [
         itn,
+        beam_width,
         "{:.3f}".format(scores["dep_loss"]),
         "{:.3f}".format(scores["ner_loss"]),
         "{:.3f}".format(scores["uas"]),
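
Two things are worth noting in the patch above. First, inside the CPU
re-evaluation branch, `nlp_loaded.parser.cfg["beam_width"]` is a bare
expression with no effect — presumably it was meant to assign `beam_width` to
the freshly reloaded model, which otherwise falls back to whatever width it
was saved with. Second, the meta.json layout now splits greedy and beam
results. A sketch of the resulting shape (keys from the patch, values elided
with Ellipsis):

    # Shape of model%d/meta.json after this patch. Width 1 keeps the
    # historical keys; other widths are filed under beam_accuracy/beam_speed,
    # keyed by width.
    meta = {
        "accuracy": ...,
        "speed": {"nwords": ..., "cpu": ..., "gpu": ...},
        "beam_accuracy": {4: ..., 16: ..., 128: ...},
        "beam_speed": {4: ..., 16: ..., 128: ...},
    }

This keeps existing consumers of `accuracy` and `speed` working while the
sweep data accumulates alongside them.
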
From ad56641324651b26978d00c833e4ef0c634a1df1 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 15 Mar 2019 15:20:09 +0100
Subject: [PATCH 03/11] Fix Language.evaluate

---
 spacy/language.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/language.py b/spacy/language.py
index 6432f3e98..4f1e59433 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -597,6 +597,8 @@ class Language(object):
     ):
         if scorer is None:
             scorer = Scorer()
+        if component_cfg is None:
+            component_cfg = {}
         docs, golds = zip(*docs_golds)
         docs = list(docs)
         golds = list(golds)

From b94b2b11682be02dec1fd3e334a96f10be9f4769 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 15 Mar 2019 15:20:28 +0100
Subject: [PATCH 04/11] Export hash_state from beam_utils

---
 spacy/syntax/_beam_utils.pxd | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/spacy/syntax/_beam_utils.pxd b/spacy/syntax/_beam_utils.pxd
index 7bae17558..36b0c05da 100644
--- a/spacy/syntax/_beam_utils.pxd
+++ b/spacy/syntax/_beam_utils.pxd
@@ -1,6 +1,9 @@
-from thinc.typedefs cimport class_t
+from thinc.typedefs cimport class_t, hash_t
 
 # These are passed as callbacks to thinc.search.Beam
 cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1
 
 cdef int check_final_state(void* _state, void* extra_args) except -1
+
+
+cdef hash_t hash_state(void* _state, void* _) except 0
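
The point of exporting `hash_state` is beam merging: two candidates that
arrive at an identical parser state are interchangeable going forward, so the
search should keep only the better-scored one instead of spending two beam
slots on duplicates. The real callback is a C-level function handed to
`thinc.search.Beam` (patch 6 below wires it in); in outline, the behaviour it
enables looks like this plain-Python sketch, where `candidates` is a
hypothetical list of (score, state) pairs rather than thinc's actual API:

    # Hash-based beam merging, sketched. hash_state maps a parser state to a
    # key that is equal for equivalent states.
    def merge_equivalent(candidates, hash_state):
        best = {}
        for score, state in candidates:
            key = hash_state(state)
            if key not in best or score > best[key][0]:
                best[key] = (score, state)  # keep the best path into each state
        return sorted(best.values(), key=lambda c: c[0], reverse=True)

The `except 0` in the declaration follows the usual Cython convention of
reserving one return value to signal that a C-level function raised.
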
From 693c8934e84c37f63e8e64dba37c7c3ff4b61583 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 15 Mar 2019 15:22:16 +0100
Subject: [PATCH 05/11] Normalize over all actions in parser, not just valid ones

---
 spacy/syntax/_parser_model.pyx | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx
index f664e6a2c..841e33432 100644
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@@ -156,7 +156,7 @@ cdef void cpu_log_loss(float* d_scores,
     """Do multi-label log loss"""
     cdef double max_, gmax, Z, gZ
     best = arg_max_if_gold(scores, costs, is_valid, O)
-    guess = arg_max_if_valid(scores, is_valid, O)
+    guess = Vec.arg_max(scores, O)
     if best == -1 or guess == -1:
         # These shouldn't happen, but if they do, we want to make sure we don't
         # cause an OOB access.
@@ -166,14 +166,11 @@
     max_ = scores[guess]
     gmax = scores[best]
     for i in range(O):
-        if is_valid[i]:
-            Z += exp(scores[i] - max_)
-            if costs[i] <= costs[best]:
-                gZ += exp(scores[i] - gmax)
+        Z += exp(scores[i] - max_)
+        if costs[i] <= costs[best]:
+            gZ += exp(scores[i] - gmax)
     for i in range(O):
-        if not is_valid[i]:
-            d_scores[i] = 0.
-        elif costs[i] <= costs[best]:
+        if costs[i] <= costs[best]:
             d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ)
         else:
             d_scores[i] = exp(scores[i]-max_) / Z
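
The change above swaps a softmax over the valid actions for a softmax over all
O actions, on both the model side and the gold side. Writing s_i for the score
of action i, c_i for its cost, and c_best for the cost of the best
gold-compatible action, the gradient the loop computes is, in LaTeX (the
max_/gmax shifts in the code are only numerical stabilisers and cancel out):

    \frac{\partial L}{\partial s_i}
        = \frac{e^{s_i}}{\sum_{j=1}^{O} e^{s_j}}
        - \mathbb{1}\bigl[c_i \le c_{\mathrm{best}}\bigr]
          \cdot \frac{e^{s_i}}{\sum_{j \,:\, c_j \le c_{\mathrm{best}}} e^{s_j}}

That is, the full distribution minus the distribution renormalized over the
gold-compatible actions. Invalid actions now receive a gradient pushing their
scores down, where previously their d_scores were simply zeroed out.
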
From b13b2aeb541d88d2eebced5399bda4d2ad8fc672 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 15 Mar 2019 15:22:38 +0100
Subject: [PATCH 06/11] Use hash_state in beam

---
 spacy/syntax/_beam_utils.pyx | 2 +-
 spacy/syntax/nn_parser.pyx   | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx
index 83137a4c4..dc482f278 100644
--- a/spacy/syntax/_beam_utils.pyx
+++ b/spacy/syntax/_beam_utils.pyx
@@ -96,7 +96,7 @@ cdef class ParserBeam(object):
             self._set_scores(beam, scores[i])
             if self.golds is not None:
                 self._set_costs(beam, self.golds[i], follow_gold=follow_gold)
-            beam.advance(transition_state, NULL, self.moves.c)
+            beam.advance(transition_state, hash_state, self.moves.c)
             beam.check_done(check_final_state, NULL)
             # This handles the non-monotonic stuff for the parser.
             if beam.is_done and self.golds is not None:
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index cbeef756d..c140cc4f3 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -119,6 +119,8 @@ cdef class Parser:
             cfg['beam_width'] = util.env_opt('beam_width', 1)
         if 'beam_density' not in cfg:
             cfg['beam_density'] = util.env_opt('beam_density', 0.0)
+        if 'beam_update_prob' not in cfg:
+            cfg['beam_update_prob'] = util.env_opt('beam_update_prob', 1.0)
         cfg.setdefault('cnn_maxout_pieces', 3)
         self.cfg = cfg
         self.model = model
@@ -383,7 +385,7 @@ cdef class Parser:
                 self.moves.set_valid(beam.is_valid[i], state)
             memcpy(beam.scores[i], c_scores, scores.shape[1] * sizeof(float))
             c_scores += scores.shape[1]
-        beam.advance(_beam_utils.transition_state, NULL, self.moves.c)
+        beam.advance(_beam_utils.transition_state, _beam_utils.hash_state, self.moves.c)
         beam.check_done(_beam_utils.check_final_state, NULL)
         return [b for b in beams if not b.is_done]

From daa8c3787a61ebbb5786b6496aa4ab3f15c83f5c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 16 Mar 2019 15:02:39 +0100
Subject: [PATCH 07/11] Add eval_beam_widths argument to spacy train

---
 spacy/cli/train.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 3411bdc68..e1d5cdefc 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -58,6 +58,7 @@ from .. import about
         str,
     ),
     noise_level=("Amount of corruption for data augmentation", "option", "nl", float),
+    eval_beam_widths=("Beam widths to evaluate, e.g. 4,8", "option", "bw", str),
    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
    learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool),
    verbose=("Display more information for debug", "flag", "VV", bool),
@@ -81,6 +82,7 @@ def train(
     parser_multitasks="",
     entity_multitasks="",
     noise_level=0.0,
+    eval_beam_widths="",
     gold_preproc=False,
     learn_tokens=False,
     verbose=False,
@@ -134,6 +136,14 @@ def train(
         util.env_opt("batch_compound", 1.001),
     )
 
+    if not eval_beam_widths:
+        eval_beam_widths = [1]
+    else:
+        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
+        if 1 not in eval_beam_widths:
+            eval_beam_widths.append(1)
+        eval_beam_widths.sort()
+
     # Set up the base model and pipeline. If a base model is specified, load
     # the model and make sure the pipeline matches the pipeline setting. If
     # training starts from a blank model, initialize the language class.
@@ -247,7 +257,7 @@ def train(
             epoch_model_path = output_path / ("model%d" % i)
             nlp.to_disk(epoch_model_path)
             nlp_loaded = util.load_model_from_path(epoch_model_path)
-            for beam_width in [1, 4, 16, 128]:
+            for beam_width in eval_beam_widths:
                 for name, component in nlp_loaded.pipeline:
                     if hasattr(component, "cfg"):
                         component.cfg["beam_width"] = beam_width
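
With the option in place, the sweep is requested per run rather than
hard-coded. Usage would look something like the following — the `-bw` short
flag is taken from the annotation above, and the positional arguments are the
usual lang/output/train/dev of `spacy train`, shown here with placeholder
paths:

    python -m spacy train en /output train.json dev.json -bw 4,8

If the option is omitted, the list defaults to [1] and only the greedy run is
evaluated; if it is given, width 1 is appended (and the list sorted), so the
greedy baseline is always reported alongside the beam runs.
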
From 7a354761c74b28f5773401f11e5888e4a003d74e Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sat, 16 Mar 2019 15:55:13 +0100
Subject: [PATCH 08/11] Auto-format

---
 spacy/cli/train.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index e1d5cdefc..c74ec8663 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -261,7 +261,9 @@ def train(
                 for name, component in nlp_loaded.pipeline:
                     if hasattr(component, "cfg"):
                         component.cfg["beam_width"] = beam_width
-                dev_docs = list(corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc))
+                dev_docs = list(
+                    corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
+                )
                 nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                 start_time = timer()
                 scorer = nlp_loaded.evaluate(dev_docs, debug)
@@ -289,19 +291,27 @@ def train(
                 meta["pipeline"] = nlp.pipe_names
                 meta["spacy_version"] = ">=%s" % about.__version__
                 if beam_width == 1:
-                    meta["speed"] = {"nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps}
+                    meta["speed"] = {
+                        "nwords": nwords,
+                        "cpu": cpu_wps,
+                        "gpu": gpu_wps,
+                    }
                     meta["accuracy"] = scorer.scores
                 else:
                     meta.setdefault("beam_accuracy", {})
                     meta.setdefault("beam_speed", {})
                     meta["beam_accuracy"][beam_width] = scorer.scores
-                    meta["beam_speed"][beam_width] = {"nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps}
+                    meta["beam_speed"][beam_width] = {
+                        "nwords": nwords,
+                        "cpu": cpu_wps,
+                        "gpu": gpu_wps,
+                    }
                 meta["vectors"] = {
                     "width": nlp.vocab.vectors_length,
                     "vectors": len(nlp.vocab.vectors),
                     "keys": nlp.vocab.vectors.n_keys,
-                    "name": nlp.vocab.vectors.name
-                }
+                    "name": nlp.vocab.vectors.name,
+                }
                 meta.setdefault("name", "model%d" % i)
                 meta.setdefault("version", version)
                 meta_loc = output_path / ("model%d" % i) / "meta.json"

From c94742ff6453a3edb1b4e8a839d6d2bb0e61c717 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sat, 16 Mar 2019 15:55:31 +0100
Subject: [PATCH 09/11] Only add beam width if customised

---
 spacy/cli/train.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index c74ec8663..42965edd0 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -143,6 +143,7 @@ def train(
         if 1 not in eval_beam_widths:
             eval_beam_widths.append(1)
         eval_beam_widths.sort()
+    has_beam_widths = eval_beam_widths != [1]
 
     # Set up the base model and pipeline. If a base model is specified, load
     # the model and make sure the pipeline matches the pipeline setting. If
@@ -211,11 +212,11 @@ def train(
 
     # fmt: off
     row_head = ("Itn", "Beam Width", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F", "Tag %", "Token %", "CPU WPS", "GPU WPS")
-    row_settings = {
-        "widths": (3, 10, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7),
-        "aligns": tuple(["r" for i in row_head]),
-        "spacing": 2
-    }
+    row_widths = (3, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7)
+    if has_beam_widths:
+        row_head.insert(1, "Beam W.")
+        row_widths.insert(1, 7)
+    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
     # fmt: on
     print("")
     msg.row(row_head, **row_settings)
@@ -318,8 +319,11 @@ def train(
                 srsly.write_json(meta_loc, meta)
                 util.set_env_log(verbose)
 
+                progress_args = [i, losses, scorer.scores]
+                if has_beam_widths:
+                    progress_args.inset(1, beam_with)
                 progress = _get_progress(
-                    i, beam_width, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps
+                    *progress_args, cpu_wps=cpu_wps, gpu_wps=gpu_wps
                 )
                 msg.row(progress, **row_settings)
     finally:

From e7aa25d9b1107a07896f34bfef4bb7f0c630a5ad Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sat, 16 Mar 2019 16:02:47 +0100
Subject: [PATCH 10/11] Fix beam width integration

---
 spacy/cli/train.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 42965edd0..b97293b16 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -211,7 +211,7 @@ def train(
         msg.text("Loaded pretrained tok2vec for: {}".format(components))
 
     # fmt: off
-    row_head = ("Itn", "Beam Width", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F", "Tag %", "Token %", "CPU WPS", "GPU WPS")
+    row_head = ("Itn", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F", "Tag %", "Token %", "CPU WPS", "GPU WPS")
     row_widths = (3, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7)
     if has_beam_widths:
         row_head.insert(1, "Beam W.")
@@ -319,11 +319,13 @@ def train(
                 srsly.write_json(meta_loc, meta)
                 util.set_env_log(verbose)
 
-                progress_args = [i, losses, scorer.scores]
-                if has_beam_widths:
-                    progress_args.inset(1, beam_with)
                 progress = _get_progress(
-                    *progress_args, cpu_wps=cpu_wps, gpu_wps=gpu_wps
+                    i,
+                    losses,
+                    scorer.scores,
+                    beam_width=beam_width if has_beam_widths else None,
+                    cpu_wps=cpu_wps,
+                    gpu_wps=gpu_wps,
                 )
                 msg.row(progress, **row_settings)
     finally:
@@ -411,7 +413,7 @@ def _get_metrics(component):
     return ("token_acc",)
 
 
-def _get_progress(itn, beam_width, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
+def _get_progress(itn, losses, dev_scores, beam_width=None, cpu_wps=0.0, gpu_wps=0.0):
     scores = {}
     for col in [
         "dep_loss",
@@ -432,9 +434,8 @@ def _get_progress(itn, beam_width, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0)
     scores.update(dev_scores)
     scores["cpu_wps"] = cpu_wps
     scores["gpu_wps"] = gpu_wps or 0.0
-    return [
+    result = [
         itn,
-        beam_width,
         "{:.3f}".format(scores["dep_loss"]),
         "{:.3f}".format(scores["ner_loss"]),
         "{:.3f}".format(scores["uas"]),
@@ -446,3 +447,6 @@ def _get_progress(itn, beam_width, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0)
         "{:.0f}".format(scores["cpu_wps"]),
         "{:.0f}".format(scores["gpu_wps"]),
     ]
+    if beam_width is not None:
+        result.insert(1, beam_width)
+    return result
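
Patch 10 above also disposes of patch 9's `progress_args.inset(1, beam_with)`
— two typos in one line (`inset` for `insert`, `beam_with` for `beam_width`)
that would have raised an AttributeError the moment a custom width was used —
by passing `beam_width` to `_get_progress` as a keyword argument, so the
positional row can no longer be scrambled. One bug survives the fix, though:
`row_head` and `row_widths` are still tuples at this point, and tuples are
immutable:

    # Why the final patch is needed: tuples have no insert() method.
    row_head = ("Itn", "Dep Loss", "NER Loss")
    row_head.insert(1, "Beam W.")  # AttributeError: 'tuple' object has no attribute 'insert'

The last patch fixes this by declaring both as lists.
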
From 0f8739c7cb9c2e2b59766cfb51159e0ddef814d1 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sat, 16 Mar 2019 16:04:15 +0100
Subject: [PATCH 11/11] Update train.py

---
 spacy/cli/train.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index b97293b16..743fec9ea 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -211,8 +211,8 @@ def train(
         msg.text("Loaded pretrained tok2vec for: {}".format(components))
 
     # fmt: off
-    row_head = ("Itn", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F", "Tag %", "Token %", "CPU WPS", "GPU WPS")
-    row_widths = (3, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7)
+    row_head = ["Itn", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F", "Tag %", "Token %", "CPU WPS", "GPU WPS"]
+    row_widths = [3, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7]
     if has_beam_widths:
         row_head.insert(1, "Beam W.")
         row_widths.insert(1, 7)
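
Assembled from the hunks above, the table set-up that the last three patches
converge on reads:

    # fmt: off
    row_head = ["Itn", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F", "Tag %", "Token %", "CPU WPS", "GPU WPS"]
    row_widths = [3, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7]
    if has_beam_widths:
        row_head.insert(1, "Beam W.")
        row_widths.insert(1, 7)
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
    # fmt: on

Only `row_head` and `row_widths` needed to become lists; `aligns` is built
fresh from `row_head` and never mutated, so it can stay a tuple. The beam
column appears only when widths beyond 1 were requested, and default runs
keep the familiar eleven-column table.
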