From 0962ee7ce6291d63d05a3269cd9c7b3443ee291a Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 23 Jun 2020 13:49:21 +0200
Subject: [PATCH 1/6] fix output_dir (converted to Path by typer)

---
 spacy/cli/convert.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 3957fd27a..c946b55d4 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -62,7 +62,7 @@ def convert_cli(
     # We get an instance of the FileTypes from the CLI so we need its string value
     file_type = file_type.value
     input_path = Path(input_path)
-    output_dir = Path(output_dir) if output_dir != "-" else "-"
+    output_dir = "-" if output_dir == Path("-") else output_dir
     cli_args = locals()
     silent = output_dir == "-"
     msg = Printer(no_print=silent)
@@ -124,7 +124,7 @@
             _print_docs_to_stdout(docs, file_type)
         else:
             subpath = input_loc.relative_to(input_path)
-            output_file = Path(output_dir) / subpath.with_suffix(f".{file_type}")
+            output_file = output_dir / subpath.with_suffix(f".{file_type}")
             _write_docs_to_file(docs, output_file, file_type)
             msg.good(f"Generated output file ({len(docs)} documents): {output_file}")
 

From 790b37390136bab87126c3eb69fb9354aef5c7c5 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 23 Jun 2020 14:05:00 +0200
Subject: [PATCH 2/6] fix var

---
 spacy/ml/tb_framework.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
index 21779ddaa..88f27f0bf 100644
--- a/spacy/ml/tb_framework.py
+++ b/spacy/ml/tb_framework.py
@@ -39,7 +39,8 @@ def forward(model, X, is_train):
 
 def init(model, X=None, Y=None):
     model.get_ref("tok2vec").initialize(X=X)
-    model.get_ref("lower").initialize()
+    lower = model.get_ref("lower")
+    lower.initialize()
     if model.attrs["has_upper"]:
         statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
         model.get_ref("upper").initialize(X=statevecs)

From 5cf3eeee0d4cd3906c94dacb1382cf4c0c149f89 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 23 Jun 2020 14:49:31 +0200
Subject: [PATCH 3/6] bugfix: update states after creating golds to avoid out
 of bounds indexing

---
 spacy/syntax/arc_eager.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 28787f97d..b57c4f312 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -615,8 +615,8 @@ cdef class ArcEager(TransitionSystem):
     def init_gold_batch(self, examples):
        states = self.init_batch([eg.predicted for eg in examples])
        keeps = [i for i, s in enumerate(states) if not s.is_final()]
-        states = [states[i] for i in keeps]
         golds = [ArcEagerGold(self, states[i], examples[i]) for i in keeps]
+        states = [states[i] for i in keeps]
         for gold in golds:
             self._replace_unseen_labels(gold)
         n_steps = sum([len(s.queue) * 4 for s in states])

From 351ab3a3d4fbe9f6fb1c8780b63b14c9b0da1ce1 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 23 Jun 2020 16:47:30 +0200
Subject: [PATCH 4/6] pull merge_sent into iob2docs to avoid Doc creation for
 each line

---
 spacy/gold/converters/iob2docs.py | 49 +++++++++++++++++++------------
 spacy/gold/converters/util.py     |  8 -----
 spacy/gold/gold_io.pyx            | 25 ++--------------
 spacy/tests/test_cli.py           | 15 +++++-----
 4 files changed, 41 insertions(+), 56 deletions(-)
 delete mode 100644 spacy/gold/converters/util.py

diff --git a/spacy/gold/converters/iob2docs.py b/spacy/gold/converters/iob2docs.py
index aba23e1b3..27876ba7a 100644
--- a/spacy/gold/converters/iob2docs.py
+++ b/spacy/gold/converters/iob2docs.py
@@ -1,9 +1,9 @@
 from wasabi import Printer
 
+from .conll_ner2docs import n_sents_info
 from ...gold import iob_to_biluo, tags_to_entities
 from ...tokens import Doc, Span
-from .util import merge_sentences
-from .conll_ner2docs import n_sents_info
+from ...util import minibatch
 
 
 def iob2docs(input_data, vocab, n_sents=10, no_print=False, *args, **kwargs):
@@ -19,31 +19,44 @@
     I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
     """
     msg = Printer(no_print=no_print)
-    docs = read_iob(input_data.split("\n"), vocab)
     if n_sents > 0:
         n_sents_info(msg, n_sents)
-        docs = merge_sentences(docs, n_sents)
+    docs = read_iob(input_data.split("\n"), vocab, n_sents)
     return docs
 
 
-def read_iob(raw_sents, vocab):
+def read_iob(raw_sents, vocab, n_sents):
     docs = []
-    for line in raw_sents:
-        if not line.strip():
-            continue
-        tokens = [t.split("|") for t in line.split()]
-        if len(tokens[0]) == 3:
-            words, tags, iob = zip(*tokens)
-        elif len(tokens[0]) == 2:
-            words, iob = zip(*tokens)
-            tags = ["-"] * len(words)
-        else:
-            raise ValueError(
-                "The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
-            )
+    for group in minibatch(raw_sents, size=n_sents):
+        tokens = []
+        words = []
+        tags = []
+        iob = []
+        sent_starts = []
+        for line in group:
+            if not line.strip():
+                continue
+            sent_tokens = [t.split("|") for t in line.split()]
+            if len(sent_tokens[0]) == 3:
+                sent_words, sent_tags, sent_iob = zip(*sent_tokens)
+            elif len(sent_tokens[0]) == 2:
+                sent_words, sent_iob = zip(*sent_tokens)
+                sent_tags = ["-"] * len(sent_words)
+            else:
+                raise ValueError(
+                    "The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
+                )
+            words.extend(sent_words)
+            tags.extend(sent_tags)
+            iob.extend(sent_iob)
+            tokens.extend(sent_tokens)
+            sent_starts.append(True)
+            sent_starts.extend([False for _ in sent_words[1:]])
         doc = Doc(vocab, words=words)
         for i, tag in enumerate(tags):
             doc[i].tag_ = tag
+        for i, sent_start in enumerate(sent_starts):
+            doc[i].is_sent_start = sent_start
         biluo = iob_to_biluo(iob)
         entities = tags_to_entities(biluo)
         doc.ents = [Span(doc, start=s, end=e, label=L) for (L, s, e) in entities]

diff --git a/spacy/gold/converters/util.py b/spacy/gold/converters/util.py
deleted file mode 100644
index 41b3e6d24..000000000
--- a/spacy/gold/converters/util.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from spacy.util import minibatch
-
-
-def merge_sentences(docs, n_sents):
-    merged = []
-    for group in minibatch(docs, size=n_sents):
-        raise NotImplementedError
-    return merged

diff --git a/spacy/gold/gold_io.pyx b/spacy/gold/gold_io.pyx
index 10ca427ed..61ffb2cfb 100644
--- a/spacy/gold/gold_io.pyx
+++ b/spacy/gold/gold_io.pyx
@@ -6,37 +6,18 @@
 from ..tokens import Doc
 from .iob_utils import biluo_tags_from_offsets
 
-def merge_sents(sents):
-    m_deps = [[], [], [], [], [], []]
-    m_cats = {}
-    m_brackets = []
-    i = 0
-    for (ids, words, tags, heads, labels, ner), (cats, brackets) in sents:
-        m_deps[0].extend(id_ + i for id_ in ids)
-        m_deps[1].extend(words)
-        m_deps[2].extend(tags)
-        m_deps[3].extend(head + i for head in heads)
-        m_deps[4].extend(labels)
-        m_deps[5].extend(ner)
-        m_brackets.extend((b["first"] + i, b["last"] + i, b["label"])
-                          for b in brackets)
-        m_cats.update(cats)
-        i += len(ids)
-    return [(m_deps, (m_cats, m_brackets))]
-
-
-def docs_to_json(docs, id=0, ner_missing_tag="O"):
+def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
     """Convert a list of Doc objects into the JSON-serializable format used by
     the spacy train command.
 
     docs (iterable / Doc): The Doc object(s) to convert.
-    id (int): Id for the JSON.
+    doc_id (int): Id for the JSON.
     RETURNS (dict): The data in spaCy's JSON format
         - each input doc will be treated as a paragraph in the output doc
     """
     if isinstance(docs, Doc):
         docs = [docs]
-    json_doc = {"id": id, "paragraphs": []}
+    json_doc = {"id": doc_id, "paragraphs": []}
     for i, doc in enumerate(docs):
         json_para = {'raw': doc.text, "sentences": [], "cats": [], "entities": [], "links": []}
         for cat, val in doc.cats.items():

diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 164961a5b..0cf070b61 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -117,7 +117,6 @@ def test_cli_converters_conllu2json_subtokens():
     assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"]
 
 
-@pytest.mark.xfail
 def test_cli_converters_iob2json(en_vocab):
     lines = [
         "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
@@ -127,19 +126,19 @@ def test_cli_converters_iob2json(en_vocab):
     ]
     input_data = "\n".join(lines)
     converted_docs = iob2docs(input_data, en_vocab, n_sents=10)
+    assert len(converted_docs) == 1
     converted = docs_to_json(converted_docs)
-    assert len(converted) == 1
-    assert converted[0]["id"] == 0
-    assert len(converted[0]["paragraphs"]) == 1
-    assert len(converted[0]["paragraphs"][0]["sentences"]) == 4
+    assert converted["id"] == 0
+    assert len(converted["paragraphs"]) == 1
+    assert len(converted["paragraphs"][0]["sentences"]) == 4
     for i in range(0, 4):
-        sent = converted[0]["paragraphs"][0]["sentences"][i]
+        sent = converted["paragraphs"][0]["sentences"][i]
         assert len(sent["tokens"]) == 8
         tokens = sent["tokens"]
         # fmt: off
         assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
-        assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
-        # fmt: on
+
+    assert converted["paragraphs"][0]["entities"] == [(18, 26, 'GPE'), (52, 60, 'GPE'), (86, 94, 'GPE'), (120, 128, 'GPE')]
 
 
 @pytest.mark.xfail

From 7c76a2b796a3dfc8f51cb6dd34b278d32441d2f4 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 23 Jun 2020 17:09:37 +0200
Subject: [PATCH 5/6] fix asserts

---
 spacy/tests/test_cli.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 0cf070b61..26d7ebd93 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -117,6 +117,7 @@ def test_cli_converters_conllu2json_subtokens():
     assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"]
 
 
+@pytest.mark.xfail
 def test_cli_converters_iob2json(en_vocab):
     lines = [
         "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
@@ -137,6 +138,9 @@ def test_cli_converters_iob2json(en_vocab):
         tokens = sent["tokens"]
         # fmt: off
         assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
+    assert len(converted_docs[0].ents) == 8
+    for ent in converted_docs[0].ents:
+        assert(ent.text in ["New York City", "London"])
 
     assert converted["paragraphs"][0]["entities"] == [(18, 26, 'GPE'), (52, 60, 'GPE'), (86, 94, 'GPE'), (120, 128, 'GPE')]
 

From 28ad71c1879586facbf77f7d0f68bc58a256171c Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 23 Jun 2020 17:20:41 +0200
Subject: [PATCH 6/6] bugfix excl Span.end in iob2docs

---
 spacy/gold/converters/iob2docs.py |  2 +-
 spacy/gold/iob_utils.py           |  2 ++
 spacy/tests/test_cli.py           | 21 ++++++++-------------
 3 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/spacy/gold/converters/iob2docs.py b/spacy/gold/converters/iob2docs.py
index 27876ba7a..51321a470 100644
--- a/spacy/gold/converters/iob2docs.py
+++ b/spacy/gold/converters/iob2docs.py
@@ -59,6 +59,6 @@ def read_iob(raw_sents, vocab, n_sents):
             doc[i].is_sent_start = sent_start
         biluo = iob_to_biluo(iob)
         entities = tags_to_entities(biluo)
-        doc.ents = [Span(doc, start=s, end=e, label=L) for (L, s, e) in entities]
+        doc.ents = [Span(doc, start=s, end=e+1, label=L) for (L, s, e) in entities]
         docs.append(doc)
     return docs

diff --git a/spacy/gold/iob_utils.py b/spacy/gold/iob_utils.py
index 3ae911418..b3d605296 100644
--- a/spacy/gold/iob_utils.py
+++ b/spacy/gold/iob_utils.py
@@ -172,6 +172,8 @@ def offsets_from_biluo_tags(doc, tags):
 
 
 def tags_to_entities(tags):
+    """ Note that the end index returned by this function is inclusive.
+    To use it for Span creation, increment the end by 1."""
     entities = []
     start = None
     for i, tag in enumerate(tags):

diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 26d7ebd93..ca0f3710f 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -10,7 +10,6 @@ from spacy.cli.pretrain import make_docs
 # from spacy.gold.converters import conllu2docs
 
 
-@pytest.mark.xfail
 def test_cli_converters_conllu2json():
     # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
     lines = [
@@ -35,7 +34,6 @@ def test_cli_converters_conllu2json():
     assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
 
 
-@pytest.mark.xfail
 @pytest.mark.parametrize(
     "lines",
     [
@@ -73,7 +71,6 @@ def test_cli_converters_conllu2json_name_ner_map(lines):
     assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"]
 
 
-@pytest.mark.xfail
 def test_cli_converters_conllu2json_subtokens():
     # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
     lines = [
@@ -117,7 +114,6 @@ def test_cli_converters_conllu2json_subtokens():
     assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"]
 
 
-@pytest.mark.xfail
 def test_cli_converters_iob2json(en_vocab):
     lines = [
         "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
@@ -142,10 +138,7 @@ def test_cli_converters_iob2json(en_vocab):
     for ent in converted_docs[0].ents:
         assert(ent.text in ["New York City", "London"])
 
-    assert converted["paragraphs"][0]["entities"] == [(18, 26, 'GPE'), (52, 60, 'GPE'), (86, 94, 'GPE'), (120, 128, 'GPE')]
 
-
-@pytest.mark.xfail
 def test_cli_converters_conll_ner2json():
     lines = [
         "-DOCSTART- -X- O O",
@@ -197,19 +190,21 @@ def test_cli_converters_conll_ner2json():
     ]
     input_data = "\n".join(lines)
     converted_docs = conll_ner2docs(input_data, n_sents=10)
+    assert len(converted_docs) == 1
     converted = docs_to_json(converted_docs)
-    assert len(converted) == 1
-    assert converted[0]["id"] == 0
-    assert len(converted[0]["paragraphs"]) == 1
-    assert len(converted[0]["paragraphs"][0]["sentences"]) == 5
+    assert converted["id"] == 0
+    assert len(converted["paragraphs"]) == 1
+    assert len(converted["paragraphs"][0]["sentences"]) == 5
     for i in range(0, 5):
-        sent = converted[0]["paragraphs"][0]["sentences"][i]
+        sent = converted["paragraphs"][0]["sentences"][i]
         assert len(sent["tokens"]) == 8
         tokens = sent["tokens"]
         # fmt: off
         assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
-        assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
         # fmt: on
+    assert len(converted_docs[0].ents) == 10
+    for ent in converted_docs[0].ents:
+        assert (ent.text in ["New York City", "London"])
 
 
 def test_pretrain_make_docs():
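
Reviewer notes (appended by the editor; illustrative sketches only, not part of the patch series above).

Note on PATCH 3/6: the out-of-bounds indexing happened because the original code shrank `states` first and then indexed the shrunken list with positions that refer to the original list. A minimal sketch of the pitfall, using hypothetical stand-in data rather than real parser states and ArcEagerGold objects:

    # "states" and "keeps" are stand-ins for the parser states and the
    # indices of non-final states in ArcEager.init_gold_batch().
    states = ["s0", "s1", "s2", "s3"]
    keeps = [1, 3]  # indices into the ORIGINAL list

    # Buggy order: after filtering, len(states) == 2, so index 3 is gone.
    # states = [states[i] for i in keeps]
    # golds = [states[i] for i in keeps]  # IndexError: list index out of range

    # Fixed order (what the patch does): build golds first, filter second.
    golds = [states[i] for i in keeps]
    states = [states[i] for i in keeps]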
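
Note on PATCH 4/6: the rewrite deletes the never-implemented `merge_sentences()` helper and instead groups `n_sents` raw input lines with `spacy.util.minibatch`, so only one Doc is created per group rather than one per line that would have to be merged afterwards. A short sketch of the grouping behaviour (the sample lines are hypothetical):

    from spacy.util import minibatch

    raw_sents = ["I|O saw|O Paris|B-GPE"] * 7  # seven hypothetical IOB lines
    groups = list(minibatch(raw_sents, size=3))
    # Two groups of 3 and one of 1; read_iob() turns each group into a
    # single Doc with sent_starts marking the sentence boundaries.
    assert [len(g) for g in groups] == [3, 3, 1]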
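
Note on PATCH 6/6: `tags_to_entities()` returns (label, start, end) tuples whose end index is inclusive, while `Span` expects an exclusive end, hence the `end=e+1` fix and the new docstring. A minimal sketch, assuming the `spacy.gold` module layout these patches target (the same helpers moved to `spacy.training` in later releases):

    from spacy.gold import iob_to_biluo, tags_to_entities
    from spacy.tokens import Doc, Span
    from spacy.vocab import Vocab

    words = ["I", "like", "New", "York", "City", "."]
    iob = ["O", "O", "B-GPE", "I-GPE", "I-GPE", "O"]

    doc = Doc(Vocab(), words=words)
    biluo = iob_to_biluo(iob)           # ['O', 'O', 'B-GPE', 'I-GPE', 'L-GPE', 'O']
    entities = tags_to_entities(biluo)  # [('GPE', 2, 4)] -- end index 4 is INCLUSIVE
    # Span takes an exclusive end; without e + 1 the entity would cover only "New York".
    doc.ents = [Span(doc, start=s, end=e + 1, label=label) for (label, s, e) in entities]
    assert [ent.text for ent in doc.ents] == ["New York City"]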