From e92be79ffcd5a6a5bf3267e2d0047c2a8abf2d14 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 22 Jun 2020 15:34:34 +0200
Subject: [PATCH 01/10] Clean up debugging

---
 spacy/syntax/transition_system.pyx | 48 ++++++++++++++++--------------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx
index 839a27b79..e1ec40e0e 100644
--- a/spacy/syntax/transition_system.pyx
+++ b/spacy/syntax/transition_system.pyx
@@ -61,7 +61,7 @@ cdef class TransitionSystem:
             offset += len(doc)
         return states
 
-    def get_oracle_sequence(self, Example example):
+    def get_oracle_sequence(self, Example example, _debug=False):
         cdef Pool mem = Pool()
         # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
         assert self.n_moves > 0
@@ -70,6 +70,8 @@ cdef class TransitionSystem:
 
         cdef StateClass state
         states, golds, n_steps = self.init_gold_batch([example])
+        if not states:
+            return []
         state = states[0]
         gold = golds[0]
         history = []
@@ -82,30 +84,32 @@ cdef class TransitionSystem:
                     history.append(i)
                     s0 = state.S(0)
                     b0 = state.B(0)
-                    debug_log.append(" ".join((
-                        self.get_class_name(i),
-                        "S0=", (example.x[s0].text if s0 >= 0 else "__"),
-                        "B0=", (example.x[b0].text if b0 >= 0 else "__"),
-                        "S0 head?", str(state.has_head(state.S(0))),
-                    )))
+                    if _debug:
+                        debug_log.append(" ".join((
+                            self.get_class_name(i),
+                            "S0=", (example.x[s0].text if s0 >= 0 else "__"),
+                            "B0=", (example.x[b0].text if b0 >= 0 else "__"),
+                            "S0 head?", str(state.has_head(state.S(0))),
+                        )))
                     action.do(state.c, action.label)
                     break
             else:
-                print("Actions")
-                for i in range(self.n_moves):
-                    print(self.get_class_name(i))
-                print("Gold")
-                for token in example.y:
-                    print(token.text, token.dep_, token.head.text)
-                s0 = state.S(0)
-                b0 = state.B(0)
-                debug_log.append(" ".join((
-                    "?",
-                    "S0=", (example.x[s0].text if s0 >= 0 else "-"),
-                    "B0=", (example.x[b0].text if b0 >= 0 else "-"),
-                    "S0 head?", str(state.has_head(state.S(0))),
-                )))
-                print("\n".join(debug_log))
+                if _debug:
+                    print("Actions")
+                    for i in range(self.n_moves):
+                        print(self.get_class_name(i))
+                    print("Gold")
+                    for token in example.y:
+                        print(token.text, token.dep_, token.head.text)
+                    s0 = state.S(0)
+                    b0 = state.B(0)
+                    debug_log.append(" ".join((
+                        "?",
+                        "S0=", (example.x[s0].text if s0 >= 0 else "-"),
+                        "B0=", (example.x[b0].text if b0 >= 0 else "-"),
+                        "S0 head?", str(state.has_head(state.S(0))),
+                    )))
+                    print("\n".join(debug_log))
                 raise ValueError(Errors.E024)
         return history
 

From 5a2d37c18f3d9acdd676dcf60d408e8b6b3b3358 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 22 Jun 2020 15:34:46 +0200
Subject: [PATCH 02/10] Xfail tests

---
 spacy/tests/parser/test_ner.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 7199e229f..897d82936 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -89,6 +89,9 @@ def test_get_oracle_moves_negative_O(tsys, vocab):
     assert names
 
 
+# We can't easily represent this on a Doc object. It's unclear what the best
+# solution would be, but this is probably not an important use case.
+@pytest.mark.xfail(reason="No longer supported")
 def test_oracle_moves_missing_B(en_vocab):
     words = ["B", "52", "Bomber"]
     biluo_tags = [None, None, "L-PRODUCT"]
@@ -111,7 +114,9 @@ def test_oracle_moves_missing_B(en_vocab):
             moves.add_action(move_types.index("U"), label)
     moves.get_oracle_sequence(example)
 
-
+# We can't easily represent this on a Doc object. It's unclear what the best
+# solution would be, but this is probably not an important use case.
+@pytest.mark.xfail(reason="No longer supported")
 def test_oracle_moves_whitespace(en_vocab):
     words = ["production", "\n", "of", "Northrop", "\n", "Corp.", "\n", "'s", "radar"]
     biluo_tags = ["O", "O", "O", "B-ORG", None, "I-ORG", "L-ORG", "O", "O"]

From 2de72b30fe3d50021e618cda8e026e71f7547b3c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 22 Jun 2020 15:34:55 +0200
Subject: [PATCH 03/10] Remove prints

---
 spacy/gold/example.pyx | 2 --
 1 file changed, 2 deletions(-)

diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx
index 20f1a783e..166e58363 100644
--- a/spacy/gold/example.pyx
+++ b/spacy/gold/example.pyx
@@ -196,8 +196,6 @@ cdef class Example:
                     next_ner = x_tags[i+1] if (i+1) < self.x.length else None
                     if prev_ner == "O" or next_ner == "O":
                         x_tags[i] = "O"
-        #print("Y tags", y_tags)
-        #print("X tags", x_tags)
         return x_tags
 
     def to_dict(self):

From bc481d83385e52bfe83cb77a0cea23fc5e804b07 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 22 Jun 2020 15:35:55 +0200
Subject: [PATCH 04/10] Remove print

---
 spacy/syntax/arc_eager.pyx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index ad6b218df..58427a3a8 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -584,7 +584,6 @@ cdef class ArcEager(TransitionSystem):
                 for label, freq in list(label_freqs.items()):
                     if freq < min_freq:
                         label_freqs.pop(label)
-                        print("Removing", action, label, freq)
         # Ensure these actions are present
         actions[BREAK].setdefault('ROOT', 0)
         if kwargs.get("learn_tokens") is True:

From c65f0ed8f6234ae9bcf3fe926f6a0c8c86e2c010 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 22 Jun 2020 15:54:04 +0200
Subject: [PATCH 05/10] Xfail some tests

---
 spacy/tests/parser/test_arc_eager_oracle.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py
index f0f41e645..954fd9a8a 100644
--- a/spacy/tests/parser/test_arc_eager_oracle.py
+++ b/spacy/tests/parser/test_arc_eager_oracle.py
@@ -65,6 +65,8 @@ def test_oracle_four_words(arc_eager, vocab):
     words = ["a", "b", "c", "d"]
     heads = [1, 1, 3, 3]
     deps = ["left", "ROOT", "left", "ROOT"]
+    for dep in deps:
+        arc_eager.add_label(dep)
     actions = ["L-left", "B-ROOT", "L-left"]
     state, cost_history = get_sequence_costs(arc_eager, words, heads, deps, actions)
     assert state.is_final()

From 53931be9a18cb0c7dfda410ac667c49499244992 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 22 Jun 2020 16:00:45 +0200
Subject: [PATCH 06/10] Replace unseen labels for parser

---
 spacy/syntax/arc_eager.pyx | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 58427a3a8..28787f97d 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -617,9 +617,29 @@ cdef class ArcEager(TransitionSystem):
         keeps = [i for i, s in enumerate(states) if not s.is_final()]
         states = [states[i] for i in keeps]
         golds = [ArcEagerGold(self, states[i], examples[i]) for i in keeps]
+        for gold in golds:
+            self._replace_unseen_labels(gold)
         n_steps = sum([len(s.queue) * 4 for s in states])
         return states, golds, n_steps
 
+    def _replace_unseen_labels(self, ArcEagerGold gold):
+        backoff_label = self.strings["dep"]
+        root_label = self.strings["ROOT"]
+        left_labels = self.labels[LEFT]
+        right_labels = self.labels[RIGHT]
+        break_labels = self.labels[BREAK]
+        for i in range(gold.c.length):
+            if not is_head_unknown(&gold.c, i):
+                head = gold.c.heads[i]
+                label = self.strings[gold.c.labels[i]]
+                if head > i and label not in left_labels:
+                    gold.c.labels[i] = backoff_label
+                elif head < i and label not in right_labels:
+                    gold.c.labels[i] = backoff_label
+                elif head == i and label not in break_labels:
+                    gold.c.labels[i] = root_label
+        return gold
+
     cdef Transition lookup_transition(self, object name_or_id) except *:
         if isinstance(name_or_id, int):
             return self.c[name_or_id]

From 4cd1c743857c082a998ca0eabbf2b100b0f9f8a3 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 22 Jun 2020 16:00:55 +0200
Subject: [PATCH 07/10] Update test

---
 spacy/tests/parser/test_arc_eager_oracle.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py
index 954fd9a8a..081be6df3 100644
--- a/spacy/tests/parser/test_arc_eager_oracle.py
+++ b/spacy/tests/parser/test_arc_eager_oracle.py
@@ -66,7 +66,8 @@ def test_oracle_four_words(arc_eager, vocab):
     heads = [1, 1, 3, 3]
     deps = ["left", "ROOT", "left", "ROOT"]
     for dep in deps:
-        arc_eager.add_label(dep)
+        arc_eager.add_action(2, dep)  # Left
+        arc_eager.add_action(3, dep)  # Right
     actions = ["L-left", "B-ROOT", "L-left"]
     state, cost_history = get_sequence_costs(arc_eager, words, heads, deps, actions)
     assert state.is_final()
@@ -143,7 +144,7 @@ def test_get_oracle_actions():
     doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
     config = {
         "learn_tokens": False,
-        "min_action_freq": 30,
+        "min_action_freq": 0,
         "beam_width": 1,
         "beam_update_prob": 1.0,
     }

From 031673dc35e7bb9f2c16de6d98323e6c51b73302 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 22 Jun 2020 16:08:01 +0200
Subject: [PATCH 08/10] Update test

---
 spacy/tests/regression/test_issue1501-2000.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index 94996c410..8c989a7eb 100644
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@@ -288,7 +288,7 @@ def test_issue1967(label):
             "entities": [label],
         },
     )
-    assert "JOB-NAME" in ner.moves.get_actions(gold_parses=[example])[1]
+    assert "JOB-NAME" in ner.moves.get_actions(examples=[example])[1]
 
 
 def test_issue1971(en_vocab):

From fedfabec80d6ceb3c28e004f9a2c2c8e18d17670 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 22 Jun 2020 16:11:42 +0200
Subject: [PATCH 09/10] Xfail test

---
 spacy/tests/parser/test_parse.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 06e363b6b..f13b7e847 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -46,6 +46,8 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
     assert doc[0].dep != 0
 
 
+# We removed the step_through API a while ago. We should bring it back, though.
+@pytest.mark.xfail(reason="Unsupported")
 def test_parser_initial(en_tokenizer, en_parser):
     text = "I ate the pizza with anchovies."
     # heads = [1, 0, 1, -2, -3, -1, -5]
@@ -89,7 +91,8 @@ def test_parser_merge_pp(en_tokenizer):
     assert doc[2].text == "another phrase"
     assert doc[3].text == "occurs"
 
-
+# We removed the step_through API a while ago. We should bring it back, though.
+@pytest.mark.xfail(reason="Unsupported")
 def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
     text = "a b c d e"
 

From afe6ee4548a76cdbad03b4949bb88cc6d6468bf3 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 22 Jun 2020 16:28:47 +0200
Subject: [PATCH 10/10] Fix Corpus

---
 spacy/gold/corpus.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py
index 1244e2516..3318a7eef 100644
--- a/spacy/gold/corpus.py
+++ b/spacy/gold/corpus.py
@@ -72,7 +72,7 @@ class Corpus:
             i += 1
         return n
 
-    def train_dataset(self, nlp, shuffle=True):
+    def train_dataset(self, nlp, shuffle=True, **kwargs):
         ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
         examples = self.make_examples(nlp, ref_docs)
         if shuffle:
@@ -80,7 +80,7 @@ class Corpus:
             random.shuffle(examples)
         yield from examples
 
-    def dev_dataset(self, nlp):
+    def dev_dataset(self, nlp, **kwargs):
         ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc))
         examples = self.make_examples(nlp, ref_docs)
         yield from examples