From e92be79ffcd5a6a5bf3267e2d0047c2a8abf2d14 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 22 Jun 2020 15:34:34 +0200 Subject: [PATCH 01/10] Clean up debugging --- spacy/syntax/transition_system.pyx | 48 ++++++++++++++++-------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 839a27b79..e1ec40e0e 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -61,7 +61,7 @@ cdef class TransitionSystem: offset += len(doc) return states - def get_oracle_sequence(self, Example example): + def get_oracle_sequence(self, Example example, _debug=False): cdef Pool mem = Pool() # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc assert self.n_moves > 0 @@ -70,6 +70,8 @@ cdef class TransitionSystem: cdef StateClass state states, golds, n_steps = self.init_gold_batch([example]) + if not states: + return [] state = states[0] gold = golds[0] history = [] @@ -82,30 +84,32 @@ cdef class TransitionSystem: history.append(i) s0 = state.S(0) b0 = state.B(0) - debug_log.append(" ".join(( - self.get_class_name(i), - "S0=", (example.x[s0].text if s0 >= 0 else "__"), - "B0=", (example.x[b0].text if b0 >= 0 else "__"), - "S0 head?", str(state.has_head(state.S(0))), - ))) + if _debug: + debug_log.append(" ".join(( + self.get_class_name(i), + "S0=", (example.x[s0].text if s0 >= 0 else "__"), + "B0=", (example.x[b0].text if b0 >= 0 else "__"), + "S0 head?", str(state.has_head(state.S(0))), + ))) action.do(state.c, action.label) break else: - print("Actions") - for i in range(self.n_moves): - print(self.get_class_name(i)) - print("Gold") - for token in example.y: - print(token.text, token.dep_, token.head.text) - s0 = state.S(0) - b0 = state.B(0) - debug_log.append(" ".join(( - "?", - "S0=", (example.x[s0].text if s0 >= 0 else "-"), - "B0=", (example.x[b0].text if b0 >= 0 else "-"), - "S0 head?", str(state.has_head(state.S(0))), - ))) - print("\n".join(debug_log)) + if _debug: + print("Actions") + for i in range(self.n_moves): + print(self.get_class_name(i)) + print("Gold") + for token in example.y: + print(token.text, token.dep_, token.head.text) + s0 = state.S(0) + b0 = state.B(0) + debug_log.append(" ".join(( + "?", + "S0=", (example.x[s0].text if s0 >= 0 else "-"), + "B0=", (example.x[b0].text if b0 >= 0 else "-"), + "S0 head?", str(state.has_head(state.S(0))), + ))) + print("\n".join(debug_log)) raise ValueError(Errors.E024) return history From 5a2d37c18f3d9acdd676dcf60d408e8b6b3b3358 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 22 Jun 2020 15:34:46 +0200 Subject: [PATCH 02/10] Xfail tests --- spacy/tests/parser/test_ner.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 7199e229f..897d82936 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -89,6 +89,9 @@ def test_get_oracle_moves_negative_O(tsys, vocab): assert names +# We can't easily represent this on a Doc object. Not sure what the best solution +# would be, but I don't think it's an important use case? +@pytest.mark.xfail(reason="No longer supported") def test_oracle_moves_missing_B(en_vocab): words = ["B", "52", "Bomber"] biluo_tags = [None, None, "L-PRODUCT"] @@ -111,7 +114,9 @@ def test_oracle_moves_missing_B(en_vocab): moves.add_action(move_types.index("U"), label) moves.get_oracle_sequence(example) - +# We can't easily represent this on a Doc object. Not sure what the best solution +# would be, but I don't think it's an important use case? +@pytest.mark.xfail(reason="No longer supported") def test_oracle_moves_whitespace(en_vocab): words = ["production", "\n", "of", "Northrop", "\n", "Corp.", "\n", "'s", "radar"] biluo_tags = ["O", "O", "O", "B-ORG", None, "I-ORG", "L-ORG", "O", "O"] From 2de72b30fe3d50021e618cda8e026e71f7547b3c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 22 Jun 2020 15:34:55 +0200 Subject: [PATCH 03/10] Remove prints --- spacy/gold/example.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index 20f1a783e..166e58363 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -196,8 +196,6 @@ cdef class Example: next_ner = x_tags[i+1] if (i+1) < self.x.length else None if prev_ner == "O" or next_ner == "O": x_tags[i] = "O" - #print("Y tags", y_tags) - #print("X tags", x_tags) return x_tags def to_dict(self): From bc481d83385e52bfe83cb77a0cea23fc5e804b07 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 22 Jun 2020 15:35:55 +0200 Subject: [PATCH 04/10] Remove print --- spacy/syntax/arc_eager.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index ad6b218df..58427a3a8 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -584,7 +584,6 @@ cdef class ArcEager(TransitionSystem): for label, freq in list(label_freqs.items()): if freq < min_freq: label_freqs.pop(label) - print("Removing", action, label, freq) # Ensure these actions are present actions[BREAK].setdefault('ROOT', 0) if kwargs.get("learn_tokens") is True: From c65f0ed8f6234ae9bcf3fe926f6a0c8c86e2c010 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 22 Jun 2020 15:54:04 +0200 Subject: [PATCH 05/10] Xfail some tests --- spacy/tests/parser/test_arc_eager_oracle.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index f0f41e645..954fd9a8a 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -65,6 +65,8 @@ def test_oracle_four_words(arc_eager, vocab): words = ["a", "b", "c", "d"] heads = [1, 1, 3, 3] deps = ["left", "ROOT", "left", "ROOT"] + for dep in deps: + arc_eager.add_label(dep) actions = ["L-left", "B-ROOT", "L-left"] state, cost_history = get_sequence_costs(arc_eager, words, heads, deps, actions) assert state.is_final() From 53931be9a18cb0c7dfda410ac667c49499244992 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 22 Jun 2020 16:00:45 +0200 Subject: [PATCH 06/10] Replace unseen labels for parser --- spacy/syntax/arc_eager.pyx | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 58427a3a8..28787f97d 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -617,9 +617,29 @@ cdef class ArcEager(TransitionSystem): keeps = [i for i, s in enumerate(states) if not s.is_final()] states = [states[i] for i in keeps] golds = [ArcEagerGold(self, states[i], examples[i]) for i in keeps] + for gold in golds: + self._replace_unseen_labels(gold) n_steps = sum([len(s.queue) * 4 for s in states]) return states, golds, n_steps + def _replace_unseen_labels(self, ArcEagerGold gold): + backoff_label = self.strings["dep"] + root_label = self.strings["ROOT"] + left_labels = self.labels[LEFT] + right_labels = self.labels[RIGHT] + break_labels = self.labels[BREAK] + for i in range(gold.c.length): + if not is_head_unknown(&gold.c, i): + head = gold.c.heads[i] + label = self.strings[gold.c.labels[i]] + if head > i and label not in left_labels: + gold.c.labels[i] = backoff_label + elif head < i and label not in right_labels: + gold.c.labels[i] = backoff_label + elif head == i and label not in break_labels: + gold.c.labels[i] = root_label + return gold + cdef Transition lookup_transition(self, object name_or_id) except *: if isinstance(name_or_id, int): return self.c[name_or_id] From 4cd1c743857c082a998ca0eabbf2b100b0f9f8a3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 22 Jun 2020 16:00:55 +0200 Subject: [PATCH 07/10] Update test --- spacy/tests/parser/test_arc_eager_oracle.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index 954fd9a8a..081be6df3 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -66,7 +66,8 @@ def test_oracle_four_words(arc_eager, vocab): heads = [1, 1, 3, 3] deps = ["left", "ROOT", "left", "ROOT"] for dep in deps: - arc_eager.add_label(dep) + arc_eager.add_action(2, dep) # Left + arc_eager.add_action(3, dep) # Right actions = ["L-left", "B-ROOT", "L-left"] state, cost_history = get_sequence_costs(arc_eager, words, heads, deps, actions) assert state.is_final() @@ -143,7 +144,7 @@ def test_get_oracle_actions(): doc = Doc(Vocab(), words=[t[1] for t in annot_tuples]) config = { "learn_tokens": False, - "min_action_freq": 30, + "min_action_freq": 0, "beam_width": 1, "beam_update_prob": 1.0, } From 031673dc35e7bb9f2c16de6d98323e6c51b73302 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 22 Jun 2020 16:08:01 +0200 Subject: [PATCH 08/10] Update test --- spacy/tests/regression/test_issue1501-2000.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 94996c410..8c989a7eb 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -288,7 +288,7 @@ def test_issue1967(label): "entities": [label], }, ) - assert "JOB-NAME" in ner.moves.get_actions(gold_parses=[example])[1] + assert "JOB-NAME" in ner.moves.get_actions(examples=[example])[1] def test_issue1971(en_vocab): From fedfabec80d6ceb3c28e004f9a2c2c8e18d17670 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 22 Jun 2020 16:11:42 +0200 Subject: [PATCH 09/10] Xfail test --- spacy/tests/parser/test_parse.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 06e363b6b..f13b7e847 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -46,6 +46,8 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text): assert doc[0].dep != 0 +# We removed the step_through API a while ago. we should bring it back though +@pytest.mark.xfail(reason="Unsupported") def test_parser_initial(en_tokenizer, en_parser): text = "I ate the pizza with anchovies." # heads = [1, 0, 1, -2, -3, -1, -5] @@ -89,7 +91,8 @@ def test_parser_merge_pp(en_tokenizer): assert doc[2].text == "another phrase" assert doc[3].text == "occurs" - +# We removed the step_through API a while ago. we should bring it back though +@pytest.mark.xfail(reason="Unsupported") def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser): text = "a b c d e" From afe6ee4548a76cdbad03b4949bb88cc6d6468bf3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 22 Jun 2020 16:28:47 +0200 Subject: [PATCH 10/10] Fix Corpus --- spacy/gold/corpus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index 1244e2516..3318a7eef 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -72,7 +72,7 @@ class Corpus: i += 1 return n - def train_dataset(self, nlp, shuffle=True): + def train_dataset(self, nlp, shuffle=True, **kwargs): ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc)) examples = self.make_examples(nlp, ref_docs) if shuffle: @@ -80,7 +80,7 @@ class Corpus: random.shuffle(examples) yield from examples - def dev_dataset(self, nlp): + def dev_dataset(self, nlp, **kwargs): ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc)) examples = self.make_examples(nlp, ref_docs) yield from examples