From 98165e43a7f38f2719310324a90f5dcfd3e00ca7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 1 Apr 2018 08:44:35 +0000 Subject: [PATCH 1/5] Sometimes update beam with greedy oracle --- spacy/syntax/nn_parser.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 7e55a668d..7030d9a3b 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -552,7 +552,7 @@ cdef class Parser: if not any(self.moves.has_gold(gold) for gold in golds): return None assert len(docs) == len(golds) - if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.0: + if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5: return self.update_beam(docs, golds, self.cfg['beam_width'], self.cfg['beam_density'], drop=drop, sgd=sgd, losses=losses) From 8a120fb4553684123fe280d3e6cd8a99cfe81c21 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 1 Apr 2018 08:45:00 +0000 Subject: [PATCH 2/5] Disable batch size compounding in ud-train --- spacy/cli/ud_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/ud_train.py b/spacy/cli/ud_train.py index 853cff9b3..58d46e34a 100644 --- a/spacy/cli/ud_train.py +++ b/spacy/cli/ud_train.py @@ -338,7 +338,7 @@ def main(ud_dir, parses_dir, config, corpus, limit=0, use_gpu=-1): optimizer = initialize_pipeline(nlp, docs, golds, config, use_gpu) - batch_sizes = compounding(config.batch_size//10, config.batch_size, 1.001) + batch_sizes = compounding(config.batch_size, config.batch_size, 1.001) for i in range(config.nr_epoch): docs = [nlp.make_doc(doc.text) for doc in docs] Xs = list(zip(docs, golds)) From 2b26c007cda336c797cd235732676ed471d53b99 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 29 Apr 2018 14:09:02 +0000 Subject: [PATCH 3/5] Revert "Disable batch size compounding in ud-train" This reverts commit 8a120fb4553684123fe280d3e6cd8a99cfe81c21. 
--- spacy/cli/ud_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/ud_train.py b/spacy/cli/ud_train.py index 58d46e34a..853cff9b3 100644 --- a/spacy/cli/ud_train.py +++ b/spacy/cli/ud_train.py @@ -338,7 +338,7 @@ def main(ud_dir, parses_dir, config, corpus, limit=0, use_gpu=-1): optimizer = initialize_pipeline(nlp, docs, golds, config, use_gpu) - batch_sizes = compounding(config.batch_size, config.batch_size, 1.001) + batch_sizes = compounding(config.batch_size//10, config.batch_size, 1.001) for i in range(config.nr_epoch): docs = [nlp.make_doc(doc.text) for doc in docs] Xs = list(zip(docs, golds)) From acebd010331f63a3d9ac9b0e70620f0765f059b2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 2 May 2018 14:19:22 +0000 Subject: [PATCH 4/5] Set children from heads in finalize doc --- spacy/syntax/arc_eager.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 0220e4443..2dd269a53 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -20,6 +20,7 @@ from .transition_system cimport move_cost_func_t, label_cost_func_t from ..gold cimport GoldParse, GoldParseC from ..structs cimport TokenC from ..errors import Errors +from ..tokens.doc cimport Doc, set_children_from_heads # Calculate cost as gold/not gold. We don't use scalar value anyway. 
cdef int BINARY_COSTS = 1 @@ -530,8 +531,9 @@ cdef class ArcEager(TransitionSystem): if st._sent[i].head == 0: st._sent[i].dep = self.root_label - def finalize_doc(self, doc): + def finalize_doc(self, Doc doc): doc.is_parsed = True + set_children_from_heads(doc.c, doc.length) cdef int set_valid(self, int* output, const StateC* st) nogil: cdef bint[N_MOVES] is_valid From 8cd06cc763ad50c9c553bdc93fd201ad30368e4c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 2 May 2018 14:39:48 +0000 Subject: [PATCH 5/5] Try to fix root-outside-sentence bug --- spacy/syntax/_state.pxd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd index a95a1910f..d082cee5c 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/syntax/_state.pxd @@ -314,8 +314,8 @@ cdef cppclass StateC: this._stack[this._s_i] = this.B(0) this._s_i += 1 this._b_i += 1 - if this.B_(0).sent_start == 1: - this.set_break(this.B(0)) + if this.safe_get(this.B_(0).l_edge).sent_start == 1: + this.set_break(this.B_(0).l_edge) if this._b_i > this._break: this._break = -1