From 792bf9476c77c567dd4a888037bff3be04c5567f Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 22 Jun 2020 14:26:38 +0200
Subject: [PATCH 1/5] Update tests

---
 spacy/tests/test_gold.py | 54 ++++++----------------------------------
 1 file changed, 7 insertions(+), 47 deletions(-)

diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 61b9ca57c..d0a05d48b 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -349,6 +349,8 @@ def test_iob_to_biluo():
         iob_to_biluo(bad_iob)
 
 
+# This test is outdated as we use DocBin now. It should probably be removed?
+@pytest.mark.xfail(reason="Outdated")
 def test_roundtrip_docs_to_json(doc):
     nlp = English()
     text = doc.text
@@ -366,7 +368,7 @@ def test_roundtrip_docs_to_json(doc):
     with make_tempdir() as tmpdir:
         json_file = tmpdir / "roundtrip.json"
         srsly.write_json(json_file, [docs_to_json(doc)])
-        goldcorpus = Corpus(train=str(json_file), dev=str(json_file))
+        goldcorpus = Corpus(str(json_file), str(json_file))
 
         reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp))
         assert len(doc) == goldcorpus.count_train()
@@ -387,39 +389,10 @@ def test_roundtrip_docs_to_json(doc):
     assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]
 
 
-@pytest.mark.xfail  # TODO do we need to do the projectivity differently?
-def test_projective_train_vs_nonprojective_dev(doc):
-    nlp = English()
-    deps = [t.dep_ for t in doc]
-    heads = [t.head.i for t in doc]
-
-    with make_tempdir() as tmpdir:
-        json_file = tmpdir / "test.json"
-        # write to JSON train dicts
-        srsly.write_json(json_file, [docs_to_json(doc)])
-        goldcorpus = Corpus(str(json_file), str(json_file))
-
-        train_reloaded_example = next(goldcorpus.train_dataset(nlp))
-        train_goldparse = get_parses_from_example(train_reloaded_example)[0][1]
-
-        dev_reloaded_example = next(goldcorpus.dev_dataset(nlp))
-        dev_goldparse = get_parses_from_example(dev_reloaded_example)[0][1]
-
-    assert is_nonproj_tree([t.head.i for t in doc]) is True
-    assert is_nonproj_tree(train_goldparse.heads) is False
-    assert heads[:-1] == train_goldparse.heads[:-1]
-    assert heads[-1] != train_goldparse.heads[-1]
-    assert deps[:-1] == train_goldparse.labels[:-1]
-    assert deps[-1] != train_goldparse.labels[-1]
-
-    assert heads == dev_goldparse.heads
-    assert deps == dev_goldparse.labels
-
-
 # Hm, not sure where misalignment check would be handled? In the components too?
 # I guess that does make sense. A text categorizer doesn't care if it's
 # misaligned...
-@pytest.mark.xfail  # TODO
+@pytest.mark.xfail(reason="Outdated")
 def test_ignore_misaligned(doc):
     nlp = English()
     text = doc.text
@@ -450,6 +423,9 @@ def test_ignore_misaligned(doc):
         assert len(train_reloaded_example) == 0
 
 
+# We probably want the orth variant logic back, but this test won't be quite
+# right -- we need to go from DocBin.
+@pytest.mark.xfail(reason="Outdated")
 def test_make_orth_variants(doc):
     nlp = English()
     with make_tempdir() as tmpdir:
@@ -594,19 +570,3 @@ def test_split_sents(merged_dict):
     assert token_annotation_2["words"] == ["It", "is", "just", "me"]
     assert token_annotation_2["tags"] == ["PRON", "AUX", "ADV", "PRON"]
     assert token_annotation_2["sent_starts"] == [1, 0, 0, 0]
-
-
-# This fails on some None value? Need to look into that.
-@pytest.mark.xfail  # TODO
-def test_tuples_to_example(vocab, merged_dict):
-    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
-    merged_dict = dict(merged_dict)
-    merged_dict["cats"] = cats
-    ex = Example.from_dict(Doc(vocab, words=merged_dict["words"]), merged_dict)
-    words = [token.text for token in ex.reference]
-    assert words == merged_dict["words"]
-    tags = [token.tag_ for token in ex.reference]
-    assert tags == merged_dict["tags"]
-    sent_starts = [token.is_sent_start for token in ex.reference]
-    assert sent_starts == [bool(v) for v in merged_dict["sent_starts"]]
-    ex.reference.cats == cats

From f73fa77bb95be33dc5421ba52cf4d6d923fc6558 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 22 Jun 2020 14:29:52 +0200
Subject: [PATCH 2/5] Update test

---
 spacy/tests/parser/test_add_label.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py
index 87675e94d..5809f16b8 100644
--- a/spacy/tests/parser/test_add_label.py
+++ b/spacy/tests/parser/test_add_label.py
@@ -54,7 +54,8 @@ def test_add_label(parser):
         losses = {}
         doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
         gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
-        parser.update((doc, gold), sgd=sgd, losses=losses)
+        example = Example.from_dict(doc, gold)
+        parser.update([example], sgd=sgd, losses=losses)
     doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
     doc = parser(doc)
     assert doc[0].dep_ == "right"

From ad50c8bacab8513363a02052924fc4b4b2abe458 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 22 Jun 2020 14:30:08 +0200
Subject: [PATCH 3/5] Add missing costs to NER oracle

---
 spacy/syntax/ner.pyx | 67 +++++++++++++++++++++++++++++---------------
 1 file changed, 44 insertions(+), 23 deletions(-)

diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx
index 04fbbefc1..4ee41779e 100644
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@@ -49,6 +49,10 @@ cdef class BiluoGold:
         self.mem = Pool()
         self.c = create_gold_state(self.mem, moves, stcls, example)
 
+    def update(self, StateClass stcls):
+        update_gold_state(&self.c, stcls)
+
+
 
 cdef GoldNERStateC create_gold_state(
     Pool mem,
@@ -58,30 +62,15 @@ cdef GoldNERStateC create_gold_state(
 ) except *:
     cdef GoldNERStateC gs
     gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition))
-    ner_tags = get_aligned_ner(example)
+    ner_tags = example.get_aligned_ner()
     for i, ner_tag in enumerate(ner_tags):
         gs.ner[i] = moves.lookup_transition(ner_tag)
     return gs
 
 
-def get_aligned_ner(Example example):
-    cand_to_gold = example.alignment.cand_to_gold
-    i2j_multi = example.alignment.i2j_multi
-    y_tags = biluo_tags_from_offsets(
-        example.y, 
-        [(e.start_char, e.end_char, e.label_) for e in example.y.ents]
-    )
-    x_tags = [None] * example.x.length
-    for i in range(example.x.length):
-        if example.x[i].is_space:
-            pass
-        elif cand_to_gold[i] is not None:
-            x_tags[i] = y_tags[cand_to_gold[i]]
-        elif i in i2j_multi:
-            # Assign O/- for many-to-one O/- NER tags
-            if y_tags[i2j_multi[i]] in ("O", "-"):
-                x_tags[i] = y_tags[i2j_multi[i]]
-    return y_tags
+cdef void update_gold_state(GoldNERStateC* gs, StateClass stcls) except *:
+    # We don't need to update each time, unlike the parser.
+    pass
 
 
 cdef do_func_t[N_MOVES] do_funcs
@@ -120,11 +109,12 @@ cdef class BiluoPushDown(TransitionSystem):
             for action in (BEGIN, IN, LAST, UNIT):
                 actions[action][entity_type] = 1
         moves = ('M', 'B', 'I', 'L', 'U')
-        for example in kwargs.get('gold_parses', []):
-            for ner_tag in example.get_aligned("ENT_TYPE", as_string=True):
-                if ner_tag != 'O' and ner_tag != '-':
+        for example in kwargs.get('examples', []):
+            for token in example.y:
+                ent_type = token.ent_type_
+                if ent_type:
                     for action in (BEGIN, IN, LAST, UNIT):
-                        actions[action][ner_tag] += 1
+                        actions[action][ent_type] += 1
         return actions
 
     @property
@@ -247,6 +237,37 @@ cdef class BiluoPushDown(TransitionSystem):
                     self.add_action(UNIT, st._sent[i].ent_type)
                     self.add_action(LAST, st._sent[i].ent_type)
 
+    def get_cost(self, StateClass stcls, gold, int i):
+        if not isinstance(gold, BiluoGold):
+            raise TypeError("Expected BiluoGold")
+        cdef BiluoGold gold_ = gold
+        gold_state = gold_.c
+        n_gold = 0
+        if self.c[i].is_valid(stcls.c, self.c[i].label):
+            cost = self.c[i].get_cost(stcls, &gold_state, self.c[i].label)
+        else:
+            cost = 9000
+        return cost
+
+    cdef int set_costs(self, int* is_valid, weight_t* costs,
+                       StateClass stcls, gold) except -1:
+        if not isinstance(gold, BiluoGold):
+            raise TypeError("Expected BiluoGold")
+        cdef BiluoGold gold_ = gold
+        gold_.update(stcls)
+        gold_state = gold_.c
+        n_gold = 0
+        for i in range(self.n_moves):
+            if self.c[i].is_valid(stcls.c, self.c[i].label):
+                is_valid[i] = True
+                costs[i] = self.c[i].get_cost(stcls, &gold_state, self.c[i].label)
+                n_gold += costs[i] <= 0
+            else:
+                is_valid[i] = False
+                costs[i] = 9000
+        if n_gold < 1:
+            raise ValueError
+
 
 cdef class Missing:
     @staticmethod

From 497fef4b5ffb14e4d641befef0e38927894cb489 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 22 Jun 2020 14:53:23 +0200
Subject: [PATCH 4/5] Update test

---
 spacy/tests/parser/test_ner.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 61e25ffee..ee143ae0d 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -51,7 +51,6 @@ def tsys(vocab, entity_types):
 
 def test_get_oracle_moves(tsys, doc, entity_annots):
     example = Example.from_dict(doc, {"entities": entity_annots})
-    tsys.preprocess_gold(example)
     act_classes = tsys.get_oracle_sequence(example)
     names = [tsys.get_class_name(act) for act in act_classes]
     assert names == ["U-PERSON", "O", "O", "B-GPE", "L-GPE", "O"]
@@ -67,7 +66,6 @@ def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots):
             ex_dict["doc_annotation"]["entities"][i] = "-"
     example = Example.from_dict(doc, ex_dict)
 
-    tsys.preprocess_gold(example)
     act_classes = tsys.get_oracle_sequence(example)
     names = [tsys.get_class_name(act) for act in act_classes]
     assert names
@@ -77,7 +75,6 @@ def test_get_oracle_moves_negative_entities2(tsys, vocab):
     doc = Doc(vocab, words=["A", "B", "C", "D"])
     entity_annots = ["B-!PERSON", "L-!PERSON", "B-!PERSON", "L-!PERSON"]
     example = Example.from_dict(doc, {"entities": entity_annots})
-    tsys.preprocess_gold(example)
     act_classes = tsys.get_oracle_sequence(example)
     names = [tsys.get_class_name(act) for act in act_classes]
     assert names
@@ -87,7 +84,6 @@ def test_get_oracle_moves_negative_O(tsys, vocab):
     doc = Doc(vocab, words=["A", "B", "C", "D"])
     entity_annots = ["O", "!O", "O", "!O"]
     example = Example.from_dict(doc, {"entities": []})
-    tsys.preprocess_gold(example)
     act_classes = tsys.get_oracle_sequence(example)
     names = [tsys.get_class_name(act) for act in act_classes]
     assert names
@@ -113,7 +109,6 @@ def test_oracle_moves_missing_B(en_vocab):
             moves.add_action(move_types.index("I"), label)
             moves.add_action(move_types.index("L"), label)
             moves.add_action(move_types.index("U"), label)
-    moves.preprocess_gold(example)
     moves.get_oracle_sequence(example)
 
 
@@ -134,7 +129,6 @@ def test_oracle_moves_whitespace(en_vocab):
         else:
             action, label = tag.split("-")
             moves.add_action(move_types.index(action), label)
-    moves.preprocess_gold(example)
     moves.get_oracle_sequence(example)
 
 

From 72ab21166d36f45084024c8d769b719a60e9ae89 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 22 Jun 2020 14:55:33 +0200
Subject: [PATCH 5/5] Work on Example.get_aligned_ner method

---
 spacy/gold/example.pyx | 58 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx
index 7ddc59cda..bfc0eb1e8 100644
--- a/spacy/gold/example.pyx
+++ b/spacy/gold/example.pyx
@@ -5,6 +5,7 @@ from ..tokens.doc cimport Doc
 from ..attrs import IDS
 from .align cimport Alignment
 from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
+from .iob_utils import spans_from_biluo_tags
 from .align import Alignment
 from ..errors import Errors, AlignmentError
 from ..structs cimport TokenC
@@ -140,6 +141,63 @@ cdef class Example:
                     aligned_deps[cand_i] = deps[gold_i]
         return aligned_heads, aligned_deps
 
+    def get_aligned_ner(self):
+        cand_to_gold = self.alignment.cand_to_gold
+        gold_to_cand = self.alignment.gold_to_cand
+        i2j_multi = self.alignment.i2j_multi
+        j2i_multi = self.alignment.j2i_multi
+        y_tags = biluo_tags_from_offsets(
+            self.y, 
+            [(e.start_char, e.end_char, e.label_) for e in self.y.ents]
+        )
+        x_tags = [None] * self.x.length
+        for i in range(self.x.length):
+            if self.x[i].is_space:
+                pass
+            elif cand_to_gold[i] is not None:
+                x_tags[i] = y_tags[cand_to_gold[i]]
+            elif i in i2j_multi:
+                # Assign O/- for many-to-one O/- NER tags
+                if y_tags[i2j_multi[i]] in ("O", "-"):
+                    x_tags[i] = y_tags[i2j_multi[i]]
+        # Assign O/- for one-to-many O/- NER tags
+        for gold_i, cand_i in enumerate(gold_to_cand):
+            if y_tags[gold_i] in ("O", "-"):
+                if cand_i is None and gold_i in j2i_multi:
+                    x_tags[j2i_multi[gold_i]] = y_tags[gold_i]
+        # TODO: I'm copying this over from v2.x but this seems kind of nuts?
+        # If there is entity annotation and some tokens remain unaligned,
+        # align all entities at the character level to account for all
+        # possible token misalignments within the entity spans
+        if list(self.y.ents) and None in x_tags:
+            # Get offsets based on gold words and BILUO entities
+            aligned_offsets = []
+            aligned_spans = []
+            # Filter offsets to identify those that align with doc tokens
+            for span in spans_from_biluo_tags(self.x, x_tags):
+                if span and not span.text.isspace():
+                    aligned_offsets.append(
+                        (span.start_char, span.end_char, span.label_)
+                    )
+                    aligned_spans.append(span)
+            # Convert back to BILUO for doc tokens and assign NER for all
+            # aligned spans
+            aligned_tags = biluo_tags_from_offsets(self.x, aligned_offsets, missing=None)
+            for span in aligned_spans:
+                for i in range(span.start, span.end):
+                    x_tags[i] = aligned_tags[i]
+            # Prevent whitespace that isn't within entities from being tagged as
+            # an entity.
+            for i, token in enumerate(self.x):
+                if token.is_space:
+                    prev_ner = x_tags[i] if i >= 1 else None
+                    next_ner = x_tags[i+1] if (i+1) < self.x.length else None
+                    if prev_ner == "O" or next_ner == "O":
+                        x_tags[i] = "O"
+        #print("Y tags", y_tags)
+        #print("X tags", x_tags)
+        return x_tags
+
     def to_dict(self):
         return {
             "doc_annotation": {