From c705a284382fe7fba5cc367ef20adff36ae00cb7 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 19 Jun 2020 11:22:24 +0200 Subject: [PATCH 01/49] add links to to_dict --- spacy/gold/example.pyx | 10 +++++++++- spacy/tests/test_gold.py | 7 +++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index 402228994..b5d1b1402 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -126,7 +126,7 @@ cdef class Example: "doc_annotation": { "cats": dict(self.reference.cats), "entities": biluo_tags_from_doc(self.reference), - "links": [], # TODO + "links": self._links_to_dict() }, "token_annotation": { "ids": [t.i+1 for t in self.reference], @@ -141,6 +141,14 @@ cdef class Example: } } + def _links_to_dict(self): + links = {} + for ent in self.reference.ents: + if ent.kb_id_: + links[(ent.start_char, ent.end_char)] = {ent.kb_id_: 1.0} + return links + + def split_sents(self): """ Split the token annotations into multiple Examples based on sent_starts and return a list of the new Examples""" diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index d98a93f2f..9e63f8a98 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -200,13 +200,16 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer): words = ["I flew", "to", "San Francisco", "Valley", "."] spaces = [True, True, True, False, False] doc = Doc(en_vocab, words=words, spaces=spaces) - entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] - links = {(len("I flew to "), len("I flew to San Francisco Valley")): {"Q816843": 1.0}} + offset_start = len("I flew to ") + offset_end = len("I flew to San Francisco Valley") + entities = [(offset_start, offset_end, "LOC")] + links = {(offset_start, offset_end): {"Q816843": 1.0}} gold_words = ["I", "flew to", "San", "Francisco Valley", "."] example = Example.from_dict(doc, {"words": gold_words, "entities": entities, "links": links}) assert example.get_aligned("ENT_IOB") == [2, 2, 3, 1, 2] assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "LOC", "LOC", ""] assert example.get_aligned("ENT_KB_ID", as_string=True) == ["", "", "Q816843", "Q816843", ""] + assert example.to_dict()["doc_annotation"]["links"][(offset_start, offset_end)] == {"Q816843": 1.0} # additional whitespace tokens in GoldParse words words, spaces = get_words_and_spaces( From 25b0674320c7fcb49921b484129c7e6d4bece272 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 19 Jun 2020 11:31:01 +0200 Subject: [PATCH 02/49] clean up --- spacy/language.py | 14 -------------- spacy/syntax/arc_eager.pyx | 6 +++--- spacy/syntax/nonproj.pyx | 4 ++-- spacy/tests/test_gold.py | 2 ++ 4 files changed, 7 insertions(+), 19 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index c168afeea..b9a84e1bb 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -646,20 +646,6 @@ class Language(object): sgd(W, dW, key=key) return losses - def preprocess_gold(self, examples): - """Can be called before training to pre-process gold data. By default, - it handles nonprojectivity and adds missing tags to the tag map. - - examples (iterable): `Example` objects. - YIELDS (tuple): `Example` objects. - """ - # TODO: This is deprecated right? 
- for name, proc in self.pipeline: - if hasattr(proc, "preprocess_gold"): - examples = proc.preprocess_gold(examples) - for eg in examples: - yield eg - def begin_training(self, get_examples=None, sgd=None, component_cfg=None, **cfg): """Allocate models, pre-process training data and acquire a trainer and optimizer. Used as a contextmanager. diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 960f9f2c2..1c4484c33 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -459,9 +459,9 @@ cdef class ArcEager(TransitionSystem): actions[RIGHT][label] = 1 actions[REDUCE][label] = 1 for example in kwargs.get('gold_parses', []): - heads, labels = nonproj.projectivize(example.token_annotation.heads, - example.token_annotation.deps) - for child, head, label in zip(example.token_annotation.ids, heads, labels): + heads, labels = nonproj.projectivize(example.get_aligned("HEAD"), + example.get_aligned("DEP")) + for child, head, label in zip(example.get_aligned("ID"), heads, labels): if label.upper() == 'ROOT' : label = 'ROOT' if head == child: diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx index 5b1f57d2b..eded53fac 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/syntax/nonproj.pyx @@ -78,8 +78,8 @@ def is_decorated(label): def count_decorated_labels(gold_data): freqs = {} for example in gold_data: - proj_heads, deco_deps = projectivize(example.token_annotation.heads, - example.token_annotation.deps) + proj_heads, deco_deps = projectivize(example.get_aligned("HEAD"), + example.get_aligned("DEP")) # set the label to ROOT for each root dependent deco_deps = ['ROOT' if head == i else deco_deps[i] for i, head in enumerate(proj_heads)] diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 9e63f8a98..f76b0c1e1 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -497,6 +497,8 @@ def test_split_sents(merged_dict): Doc(nlp.vocab, words=merged_dict["words"], spaces=merged_dict["spaces"]), merged_dict ) + assert example.text == "Hi there everyone It is just me" + assert len(get_parses_from_example( example, merge=False, From e30ec9b2a8beebe988e0ecce944ac40a8918c4f9 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 19 Jun 2020 14:05:35 +0200 Subject: [PATCH 03/49] fix test checking for variants --- examples/experiments/onto-joint/defaults.cfg | 1 - examples/experiments/onto-joint/pretrain.cfg | 1 - .../ptb-joint-pos-dep/bilstm_tok2vec.cfg | 1 - .../ptb-joint-pos-dep/defaults.cfg | 1 - spacy/cli/__init__.py | 2 +- spacy/cli/{train_from_config.py => train.py} | 1 - spacy/gold/augment.py | 29 ++++++----------- spacy/gold/corpus.py | 32 ++++++++----------- spacy/tests/test_gold.py | 18 ++--------- 9 files changed, 27 insertions(+), 59 deletions(-) rename spacy/cli/{train_from_config.py => train.py} (99%) diff --git a/examples/experiments/onto-joint/defaults.cfg b/examples/experiments/onto-joint/defaults.cfg index f76336d84..337fe0379 100644 --- a/examples/experiments/onto-joint/defaults.cfg +++ b/examples/experiments/onto-joint/defaults.cfg @@ -9,7 +9,6 @@ max_length = 0 limit = 0 # Data augmentation orth_variant_level = 0.0 -noise_level = 0.0 dropout = 0.1 # Controls early-stopping. 0 or -1 mean unlimited. 
patience = 1600 diff --git a/examples/experiments/onto-joint/pretrain.cfg b/examples/experiments/onto-joint/pretrain.cfg index 40885b6e8..83991f888 100644 --- a/examples/experiments/onto-joint/pretrain.cfg +++ b/examples/experiments/onto-joint/pretrain.cfg @@ -9,7 +9,6 @@ max_length = 0 limit = 0 # Data augmentation orth_variant_level = 0.0 -noise_level = 0.0 dropout = 0.1 # Controls early-stopping. 0 or -1 mean unlimited. patience = 1600 diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg index 905b5b4e0..f1b702a4e 100644 --- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg +++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg @@ -6,7 +6,6 @@ init_tok2vec = null vectors = null max_epochs = 100 orth_variant_level = 0.0 -noise_level = 0.0 gold_preproc = true max_length = 0 use_gpu = 0 diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg index 7383116e7..1c946ac60 100644 --- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg @@ -6,7 +6,6 @@ init_tok2vec = null vectors = null max_epochs = 100 orth_variant_level = 0.0 -noise_level = 0.0 gold_preproc = true max_length = 0 use_gpu = -1 diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 2ffbe2d0c..6f09c6884 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -4,7 +4,7 @@ from .download import download # noqa: F401 from .info import info # noqa: F401 from .package import package # noqa: F401 from .profile import profile # noqa: F401 -from .train_from_config import train_cli # noqa: F401 +from .train import train_cli # noqa: F401 from .pretrain import pretrain # noqa: F401 from .debug_data import debug_data # noqa: F401 from .evaluate import evaluate # noqa: F401 diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train.py similarity index 99% rename from spacy/cli/train_from_config.py rename to spacy/cli/train.py index 3a4d28356..fb4347158 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train.py @@ -371,7 +371,6 @@ def create_train_batches(nlp, corpus, cfg): train_examples = list( corpus.train_dataset( nlp, - noise_level=cfg["noise_level"], # I think this is deprecated? 
orth_variant_level=cfg["orth_variant_level"], gold_preproc=cfg["gold_preproc"], max_length=cfg["max_length"], diff --git a/spacy/gold/augment.py b/spacy/gold/augment.py index a129793c8..dda51cda6 100644 --- a/spacy/gold/augment.py +++ b/spacy/gold/augment.py @@ -2,6 +2,15 @@ import random import itertools +def make_orth_variants_example(nlp, example, orth_variant_level=0.0): # TODO: naming + raw_text = example.text + orig_dict = example.to_dict() + variant_text, variant_token_annot = make_orth_variants(nlp, raw_text, orig_dict["token_annotation"], orth_variant_level) + doc = nlp.make_doc(variant_text) + orig_dict["token_annotation"] = variant_token_annot + return example.from_dict(doc, orig_dict) + + def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0): if random.random() >= orth_variant_level: return raw_text, orig_token_dict @@ -98,23 +107,3 @@ def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0): raw_idx += 1 raw = variant_raw return raw, token_dict - - -def add_noise(orig, noise_level): - if random.random() >= noise_level: - return orig - elif type(orig) == list: - corrupted = [_corrupt(word, noise_level) for word in orig] - corrupted = [w for w in corrupted if w] - return corrupted - else: - return "".join(_corrupt(c, noise_level) for c in orig) - - -def _corrupt(c, noise_level): - if random.random() >= noise_level: - return c - elif c in [".", "'", "!", "?", ","]: - return "\n" - else: - return c.lower() diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index d55845fb8..c84f8355f 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -8,7 +8,7 @@ from ..tokens import Doc from .. import util from ..errors import Errors, AlignmentError from .gold_io import read_json_file, json_to_annotations -from .augment import make_orth_variants, add_noise +from .augment import make_orth_variants from .example import Example @@ -148,7 +148,6 @@ class GoldCorpus(object): nlp, gold_preproc=False, max_length=None, - noise_level=0.0, orth_variant_level=0.0, ignore_misaligned=False, ): @@ -160,7 +159,6 @@ class GoldCorpus(object): train_annotations, gold_preproc, max_length=max_length, - noise_level=noise_level, orth_variant_level=orth_variant_level, make_projective=True, ignore_misaligned=ignore_misaligned, @@ -194,33 +192,31 @@ class GoldCorpus(object): annotations, gold_preproc, max_length=None, - noise_level=0.0, orth_variant_level=0.0, make_projective=False, ignore_misaligned=False, ): """ Setting gold_preproc will result in creating a doc per sentence """ for eg_dict in annotations: + token_annot = eg_dict.get("token_annotation", {}) if eg_dict["text"]: - example = Example.from_dict( - nlp.make_doc(eg_dict["text"]), - eg_dict - ) + doc = nlp.make_doc(eg_dict["text"]) + elif "words" in token_annot: + doc = Doc(nlp.vocab, words=token_annot["words"]) else: - example = Example.from_dict( - Doc(nlp.vocab, words=eg_dict["words"]), - eg_dict - ) + raise ValueError("Expecting either 'text' or token_annotation.words annotation") + if gold_preproc: - # TODO: Data augmentation + variant_text, variant_token_annot = make_orth_variants(nlp, doc.text, token_annot, orth_variant_level) + doc = nlp.make_doc(variant_text) + eg_dict["token_annotation"] = variant_token_annot + example = Example.from_dict(doc, eg_dict) examples = example.split_sents() + else: + example = Example.from_dict(doc, eg_dict) examples = [example] + for eg in examples: if (not max_length) or len(eg.predicted) < max_length: - if ignore_misaligned: - try: - _ = 
eg._deprecated_get_gold() - except AlignmentError: - continue yield eg diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index f76b0c1e1..726492138 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -11,6 +11,7 @@ import pytest import srsly from .util import make_tempdir +from ..gold.augment import make_orth_variants_example @pytest.fixture @@ -387,8 +388,8 @@ def test_make_orth_variants(doc): goldcorpus = GoldCorpus(str(json_file), str(json_file)) # due to randomness, test only that this runs with no errors for now - train_reloaded_example = next(goldcorpus.train_dataset(nlp, orth_variant_level=0.2)) - train_goldparse = get_parses_from_example(train_reloaded_example)[0][1] + train_example = next(goldcorpus.train_dataset(nlp)) + variant_example = make_orth_variants_example(nlp, train_example, orth_variant_level=0.2) @pytest.mark.parametrize( @@ -499,19 +500,6 @@ def test_split_sents(merged_dict): ) assert example.text == "Hi there everyone It is just me" - assert len(get_parses_from_example( - example, - merge=False, - vocab=nlp.vocab, - make_projective=False) - ) == 2 - assert len(get_parses_from_example( - example, - merge=True, - vocab=nlp.vocab, - make_projective=False - )) == 1 - split_examples = example.split_sents() assert len(split_examples) == 2 assert split_examples[0].text == "Hi there everyone " From 161d8439fab3f2635f20bfc3fb1efa491a359722 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 03:19:40 +0200 Subject: [PATCH 04/49] Start updating converters --- spacy/cli/converters/__init__.py | 4 +- spacy/cli/converters/conll_ner2json.py | 59 ++++++++++++++------------ spacy/cli/converters/iob2json.py | 40 ++++++----------- spacy/cli/converters/jsonl2json.py | 9 ++-- 4 files changed, 51 insertions(+), 61 deletions(-) diff --git a/spacy/cli/converters/__init__.py b/spacy/cli/converters/__init__.py index 9dcbf5b13..e44ad407d 100644 --- a/spacy/cli/converters/__init__.py +++ b/spacy/cli/converters/__init__.py @@ -1,4 +1,4 @@ from .conllu2json import conllu2json # noqa: F401 -from .iob2json import iob2json # noqa: F401 +from .iob2json import iob2docs # noqa: F401 from .conll_ner2json import conll_ner2json # noqa: F401 -from .jsonl2json import ner_jsonl2json # noqa: F401 +from .jsonl2docs import ner_jsonl2json # noqa: F401 diff --git a/spacy/cli/converters/conll_ner2json.py b/spacy/cli/converters/conll_ner2json.py index b607d5913..8d4139bde 100644 --- a/spacy/cli/converters/conll_ner2json.py +++ b/spacy/cli/converters/conll_ner2json.py @@ -3,15 +3,16 @@ from wasabi import Printer from ...gold import iob_to_biluo from ...lang.xx import MultiLanguage from ...tokens.doc import Doc +from ...vocab import Vocab from ...util import load_model -def conll_ner2json( +def conll_ner2doc( input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs ): """ Convert files in the CoNLL-2003 NER format and similar - whitespace-separated columns into JSON format for use with train cli. + whitespace-separated columns into Doc objects. The first column is the tokens, the final column is the IOB tags. If an additional second column is present, the second column is the tags. @@ -81,17 +82,25 @@ def conll_ner2json( "No document delimiters found. Use `-n` to automatically group " "sentences into documents." 
) + + if model: + nlp = load_model(model) + else: + nlp = MultiLanguage() output_docs = [] - for doc in input_data.strip().split(doc_delimiter): - doc = doc.strip() - if not doc: + for conll_doc in input_data.strip().split(doc_delimiter): + conll_doc = conll_doc.strip() + if not conll_doc: continue - output_doc = [] - for sent in doc.split("\n\n"): - sent = sent.strip() + words = [] + sent_starts = [] + pos_tags = [] + biluo_tags = [] + for conll_sent in conll_doc.split("\n\n"): + conll_sent = conll_sent.strip() if not sent: continue - lines = [line.strip() for line in sent.split("\n") if line.strip()] + lines = [line.strip() for line in conll_sent.split("\n") if line.strip()] cols = list(zip(*[line.split() for line in lines])) if len(cols) < 2: raise ValueError( @@ -99,25 +108,19 @@ def conll_ner2json( "Try checking whitespace and delimiters. See " "https://spacy.io/api/cli#convert" ) - words = cols[0] - iob_ents = cols[-1] - if len(cols) > 2: - tags = cols[1] - else: - tags = ["-"] * len(words) - biluo_ents = iob_to_biluo(iob_ents) - output_doc.append( - { - "tokens": [ - {"orth": w, "tag": tag, "ner": ent} - for (w, tag, ent) in zip(words, tags, biluo_ents) - ] - } - ) - output_docs.append( - {"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]} - ) - output_doc = [] + length = len(cols[0]) + words.extend(cols[0]) + sent_stats.extend([True] + [False] * (length - 1)) + biluo_tags.extend(iob_to_biluo(cols[-1])) + pos_tags.extend(cols[1] if len(cols) > 2 else ["-"] * length) + + doc = Doc(nlp.vocab, words=words) + for i, token in enumerate(doc): + token.tag_ = pos_tags[i] + token.is_sent_start = sent_starts[i] + entities = tags_to_entities(biluo_tags) + doc.ents = [Span(doc, start=s, end=e+1, label=L) for L, s, e in entities] + output_docs.append(doc) return output_docs diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py index b6ac234fc..2addc1af4 100644 --- a/spacy/cli/converters/iob2json.py +++ b/spacy/cli/converters/iob2json.py @@ -1,14 +1,15 @@ from wasabi import Printer -from ...gold import iob_to_biluo +from ...gold import iob_to_biluo, tags_to_entities from ...util import minibatch +from .util import merge_sentences from .conll_ner2json import n_sents_info -def iob2json(input_data, n_sents=10, no_print=False, *args, **kwargs): +def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs): """ Convert IOB files with one sentence per line and tags separated with '|' - into JSON format for use with train cli. IOB and IOB2 are accepted. + into Doc objects so they can be saved. IOB and IOB2 are accepted. Sample formats: @@ -26,40 +27,25 @@ def iob2json(input_data, n_sents=10, no_print=False, *args, **kwargs): def read_iob(raw_sents): - sentences = [] + docs = [] for line in raw_sents: if not line.strip(): continue tokens = [t.split("|") for t in line.split()] if len(tokens[0]) == 3: - words, pos, iob = zip(*tokens) + words, tags, iob = zip(*tokens) elif len(tokens[0]) == 2: words, iob = zip(*tokens) - pos = ["-"] * len(words) + tags = ["-"] * len(words) else: raise ValueError( "The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. 
See https://spacy.io/api/cli#convert" ) + doc = Doc(vocab, words=words) + for i, tag in enumerate(pos): + doc[i].tag_ = tag biluo = iob_to_biluo(iob) - sentences.append( - [ - {"orth": w, "tag": p, "ner": ent} - for (w, p, ent) in zip(words, pos, biluo) - ] - ) - sentences = [{"tokens": sent} for sent in sentences] - paragraphs = [{"sentences": [sent]} for sent in sentences] - docs = [{"id": i, "paragraphs": [para]} for i, para in enumerate(paragraphs)] + entities = biluo_tags_to_entities(biluo) + doc.ents = [Span(doc, start=s, end=e, label=L) for (L, s, e) in entities] + docs.append(doc) return docs - - -def merge_sentences(docs, n_sents): - merged = [] - for group in minibatch(docs, size=n_sents): - group = list(group) - first = group.pop(0) - to_extend = first["paragraphs"][0]["sentences"] - for sent in group: - to_extend.extend(sent["paragraphs"][0]["sentences"]) - merged.append(first) - return merged diff --git a/spacy/cli/converters/jsonl2json.py b/spacy/cli/converters/jsonl2json.py index 525063b22..8639a11b9 100644 --- a/spacy/cli/converters/jsonl2json.py +++ b/spacy/cli/converters/jsonl2json.py @@ -4,15 +4,17 @@ from ...gold import docs_to_json from ...util import get_lang_class, minibatch -def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_): +def ner_jsonl2docs(input_data, lang=None, n_sents=10, use_morphology=False, **_): if lang is None: raise ValueError("No --lang specified, but tokenization required") - json_docs = [] + docs = [] input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")] nlp = get_lang_class(lang)() sentencizer = nlp.create_pipe("sentencizer") for i, batch in enumerate(minibatch(input_examples, size=n_sents)): docs = [] + # TODO: Should we be merging these? We're disrespecting the n_sents + # currently. 
for record in batch: raw_text = record["text"] if "entities" in record: @@ -25,8 +27,7 @@ def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_) spans = [doc.char_span(s, e, label=L) for s, e, L in ents] doc.ents = _cleanup_spans(spans) docs.append(doc) - json_docs.append(docs_to_json(docs, id=i)) - return json_docs + return docs def _cleanup_spans(spans): From c630cfdb5e28a8dbb1126e8e90e0574516fe177b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 03:20:34 +0200 Subject: [PATCH 05/49] Move converters under spacy.gold --- spacy/{cli => gold}/converters/__init__.py | 0 spacy/{cli => gold}/converters/conll_ner2json.py | 0 spacy/{cli => gold}/converters/conllu2json.py | 0 spacy/{cli => gold}/converters/iob2json.py | 0 spacy/{cli => gold}/converters/jsonl2json.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename spacy/{cli => gold}/converters/__init__.py (100%) rename spacy/{cli => gold}/converters/conll_ner2json.py (100%) rename spacy/{cli => gold}/converters/conllu2json.py (100%) rename spacy/{cli => gold}/converters/iob2json.py (100%) rename spacy/{cli => gold}/converters/jsonl2json.py (100%) diff --git a/spacy/cli/converters/__init__.py b/spacy/gold/converters/__init__.py similarity index 100% rename from spacy/cli/converters/__init__.py rename to spacy/gold/converters/__init__.py diff --git a/spacy/cli/converters/conll_ner2json.py b/spacy/gold/converters/conll_ner2json.py similarity index 100% rename from spacy/cli/converters/conll_ner2json.py rename to spacy/gold/converters/conll_ner2json.py diff --git a/spacy/cli/converters/conllu2json.py b/spacy/gold/converters/conllu2json.py similarity index 100% rename from spacy/cli/converters/conllu2json.py rename to spacy/gold/converters/conllu2json.py diff --git a/spacy/cli/converters/iob2json.py b/spacy/gold/converters/iob2json.py similarity index 100% rename from spacy/cli/converters/iob2json.py rename to spacy/gold/converters/iob2json.py diff --git a/spacy/cli/converters/jsonl2json.py b/spacy/gold/converters/jsonl2json.py similarity index 100% rename from spacy/cli/converters/jsonl2json.py rename to spacy/gold/converters/jsonl2json.py From f61d5e3ac354df372cc6320482626856ea027135 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 03:23:58 +0200 Subject: [PATCH 06/49] Move things around --- spacy/gold/converters/__init__.py | 8 ++++---- .../converters/{conll_ner2json.py => conll_ner2doc.py} | 0 spacy/gold/converters/{iob2json.py => iob2doc.py} | 0 spacy/gold/converters/{jsonl2json.py => jsonl2docs.py} | 0 spacy/gold/converters/util.py | 5 +++++ 5 files changed, 9 insertions(+), 4 deletions(-) rename spacy/gold/converters/{conll_ner2json.py => conll_ner2doc.py} (100%) rename spacy/gold/converters/{iob2json.py => iob2doc.py} (100%) rename spacy/gold/converters/{jsonl2json.py => jsonl2docs.py} (100%) create mode 100644 spacy/gold/converters/util.py diff --git a/spacy/gold/converters/__init__.py b/spacy/gold/converters/__init__.py index e44ad407d..a046466fc 100644 --- a/spacy/gold/converters/__init__.py +++ b/spacy/gold/converters/__init__.py @@ -1,4 +1,4 @@ -from .conllu2json import conllu2json # noqa: F401 -from .iob2json import iob2docs # noqa: F401 -from .conll_ner2json import conll_ner2json # noqa: F401 -from .jsonl2docs import ner_jsonl2json # noqa: F401 +from .conllu2docs import conllu2docs # noqa: F401 +from .iob2docs import iob2docs # noqa: F401 +from .conll_ner2docs import conll_ner2docs # noqa: F401 +from .jsonl2docs import ner_jsonl2docs # noqa: F401 diff 
--git a/spacy/gold/converters/conll_ner2json.py b/spacy/gold/converters/conll_ner2doc.py similarity index 100% rename from spacy/gold/converters/conll_ner2json.py rename to spacy/gold/converters/conll_ner2doc.py diff --git a/spacy/gold/converters/iob2json.py b/spacy/gold/converters/iob2doc.py similarity index 100% rename from spacy/gold/converters/iob2json.py rename to spacy/gold/converters/iob2doc.py diff --git a/spacy/gold/converters/jsonl2json.py b/spacy/gold/converters/jsonl2docs.py similarity index 100% rename from spacy/gold/converters/jsonl2json.py rename to spacy/gold/converters/jsonl2docs.py diff --git a/spacy/gold/converters/util.py b/spacy/gold/converters/util.py new file mode 100644 index 000000000..ed9c84203 --- /dev/null +++ b/spacy/gold/converters/util.py @@ -0,0 +1,5 @@ +def merge_sentences(docs, n_sents): + merged = [] + for group in minibatch(docs, size=n_sents): + raise NotImplementedError + return merged From e20a7808672816e8c7c936a3ace63c126c95ff41 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 03:24:49 +0200 Subject: [PATCH 07/49] Fix naming --- spacy/gold/converters/{conll_ner2doc.py => conll_ner2docs.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename spacy/gold/converters/{conll_ner2doc.py => conll_ner2docs.py} (100%) diff --git a/spacy/gold/converters/conll_ner2doc.py b/spacy/gold/converters/conll_ner2docs.py similarity index 100% rename from spacy/gold/converters/conll_ner2doc.py rename to spacy/gold/converters/conll_ner2docs.py From d9a8fdf4b74cf65ad31f28f9a8ee1f20de6fb2fa Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 03:26:36 +0200 Subject: [PATCH 08/49] Fix name --- spacy/gold/converters/{iob2doc.py => iob2docs.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename spacy/gold/converters/{iob2doc.py => iob2docs.py} (100%) diff --git a/spacy/gold/converters/iob2doc.py b/spacy/gold/converters/iob2docs.py similarity index 100% rename from spacy/gold/converters/iob2doc.py rename to spacy/gold/converters/iob2docs.py From 3a73d95dccba3d9f04323000ceb438bad0471ea4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 03:50:13 +0200 Subject: [PATCH 09/49] Update converter to produce DocBin --- spacy/cli/convert.py | 65 ++++++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 23 deletions(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 2ffbeb458..e2b6efc33 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -2,6 +2,7 @@ from pathlib import Path from wasabi import Printer import srsly import re +import sys from .converters import conllu2json, iob2json, conll_ner2json from .converters import ner_jsonl2json @@ -11,15 +12,29 @@ from .converters import ner_jsonl2json # matched by file extension and content. To add a converter, add a new # entry to this dict with the file extension mapped to the converter function # imported from /converters. 
-CONVERTERS = { - "conllubio": conllu2json, - "conllu": conllu2json, - "conll": conllu2json, - "ner": conll_ner2json, - "iob": iob2json, - "jsonl": ner_jsonl2json, + +DOC_CONVERTERS = { + "conllubio": conllu2doc, + "conllu": conllu2doc, + "conll": conllu2doc, + "ner": conll_ner2doc, + "iob": iob2doc, + "jsonl": ner_jsonl2doc, + "json": json2docs, } + +ALL_ATTRS = [ + "ORTH", + "TAG", + "HEAD", + "DEP", + "SENT_START", + "ENT_IOB", + "ENT_TYPE", + "LEMMA", + "MORPH", +] # File types FILE_TYPES = ("json", "jsonl", "msg") FILE_TYPES_STDOUT = ("json", "jsonl") @@ -82,7 +97,7 @@ def convert( ner_map = srsly.read_json(ner_map_path) # Use converter function to convert data func = CONVERTERS[converter] - data = func( + docs = func( input_data, n_sents=n_sents, seg_sents=seg_sents, @@ -93,23 +108,27 @@ def convert( no_print=no_print, ner_map=ner_map, ) - if output_dir != "-": - # Export data to a file - suffix = f".{file_type}" - output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix) - if file_type == "json": - srsly.write_json(output_file, data) - elif file_type == "jsonl": - srsly.write_jsonl(output_file, data) - elif file_type == "msg": - srsly.write_msgpack(output_file, data) - msg.good(f"Generated output file ({len(data)} documents): {output_file}") + if write_json: + data = docs2json(docs) else: - # Print to stdout - if file_type == "json": + data = DocBin(attrs=ALL_ATTRS, docs=docs).to_bytes() + + if output_dir == "-": + if write_json: srsly.write_json("-", data) - elif file_type == "jsonl": - srsly.write_jsonl("-", data) + else: + sys.stdout.write(data) + else: + # Export data to a file + if write_json: + suffix = f".{file_type}" + output_file = output_dir / input_path.parts[-1].with_suffix(suffix) + srsly.write_json(output_file, data) + else: + output_file = output_dir / input_path.parts[-1].with_suffix("spacy") + with output_file.open("wb") as file_: + file_.write(data) + msg.good(f"Generated output file ({len(data)} documents): {output_file}") def autodetect_ner_format(input_data): From 95df02875827d9abd93664e164bdb4bed5468a73 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 03:50:23 +0200 Subject: [PATCH 10/49] Update converters --- spacy/gold/converters/__init__.py | 4 +++- spacy/gold/converters/conll_ner2docs.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/gold/converters/__init__.py b/spacy/gold/converters/__init__.py index a046466fc..6ccc0f8f5 100644 --- a/spacy/gold/converters/__init__.py +++ b/spacy/gold/converters/__init__.py @@ -1,4 +1,6 @@ -from .conllu2docs import conllu2docs # noqa: F401 from .iob2docs import iob2docs # noqa: F401 from .conll_ner2docs import conll_ner2docs # noqa: F401 from .jsonl2docs import ner_jsonl2docs # noqa: F401 + +# TODO: Update this one +#from .conllu2docs import conllu2docs # noqa: F401 diff --git a/spacy/gold/converters/conll_ner2docs.py b/spacy/gold/converters/conll_ner2docs.py index 8d4139bde..7042bd7d6 100644 --- a/spacy/gold/converters/conll_ner2docs.py +++ b/spacy/gold/converters/conll_ner2docs.py @@ -7,7 +7,7 @@ from ...vocab import Vocab from ...util import load_model -def conll_ner2doc( +def conll_ner2docs( input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs ): """ From 0d22c6e006e27b34351b6b7ff361f367628fade2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 03:50:36 +0200 Subject: [PATCH 11/49] Allow DocBin to take list of Doc objects. 
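A rough sketch of the usage this change is meant to enable (illustrative only; the blank pipeline and example texts below are placeholders, not part of the patch):

    import spacy
    from spacy.tokens import DocBin

    nlp = spacy.blank("en")
    docs = [nlp.make_doc("San Francisco considers banning delivery robots"),
            nlp.make_doc("Berlin is a city in Germany")]
    # Seed the DocBin directly from existing Doc objects via the new `docs`
    # keyword, then round-trip through bytes and rebuild the Docs against a
    # shared vocab.
    doc_bin = DocBin(docs=docs)
    data = doc_bin.to_bytes()
    restored = list(DocBin().from_bytes(data).get_docs(nlp.vocab))
    assert len(restored) == len(docs)

Accepting the docs at construction time keeps callers such as the converters to a single call instead of a manual loop of add() calls.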
--- spacy/tokens/_serialize.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index d3f49550c..7bf3faab3 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -39,7 +39,7 @@ class DocBin(object): document from the DocBin. """ - def __init__(self, attrs=None, store_user_data=False): + def __init__(self, attrs=None, store_user_data=False, docs=[]): """Create a DocBin object to hold serialized annotations. attrs (list): List of attributes to serialize. 'orth' and 'spacy' are @@ -59,6 +59,8 @@ class DocBin(object): self.user_data = [] self.strings = set() self.store_user_data = store_user_data + for doc in docs: + self.add(docs) def __len__(self): """RETURNS: The number of Doc objects added to the DocBin.""" From 7a846921a36706b58a3ceea1a89e58407956b68b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 15:55:35 +0200 Subject: [PATCH 12/49] Make spacy convert output docbin --- spacy/cli/convert.py | 223 ++++++++++++++++++++++++++++--------------- 1 file changed, 144 insertions(+), 79 deletions(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index e2b6efc33..4cf960379 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -4,8 +4,9 @@ import srsly import re import sys -from .converters import conllu2json, iob2json, conll_ner2json -from .converters import ner_jsonl2json +from ..tokens import DocBin +from ..gold.converters import iob2docs, conll_ner2docs, json2docs +from ..gold.converters import ner_jsonl2docs # Converters are matched by file extension except for ner/iob, which are @@ -13,13 +14,13 @@ from .converters import ner_jsonl2json # entry to this dict with the file extension mapped to the converter function # imported from /converters. -DOC_CONVERTERS = { - "conllubio": conllu2doc, - "conllu": conllu2doc, - "conll": conllu2doc, - "ner": conll_ner2doc, - "iob": iob2doc, - "jsonl": ner_jsonl2doc, +CONVERTERS = { + #"conllubio": conllu2docs, TODO + #"conllu": conllu2docs, TODO + #"conll": conllu2docs, TODO + "ner": conll_ner2docs, + "iob": iob2docs, + "jsonl": ner_jsonl2docs, "json": json2docs, } @@ -42,93 +43,58 @@ FILE_TYPES_STDOUT = ("json", "jsonl") def convert( # fmt: off - input_file: ("Input file", "positional", None, str), - output_dir: ("Output directory. 
'-' for stdout.", "positional", None, str) = "-", - file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "json", + input_path: ("Input file or directory", "positional", None, Path), + output_dir: ("Output directory.", "positional", None, Path), + file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "spacy", n_sents: ("Number of sentences per doc (0 to disable)", "option", "n", int) = 1, seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False, model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None, morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False, merge_subtokens: ("Merge CoNLL-U subtokens", "flag", "T", bool) = False, converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto", - ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None, + ner_map: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None, lang: ("Language (if tokenizer required)", "option", "l", str) = None, # fmt: on ): """ - Convert files into JSON format for use with train command and other - experiment management functions. If no output_dir is specified, the data - is written to stdout, so you can pipe them forward to a JSON file: - $ spacy convert some_file.conllu > some_file.json + Convert files into json or DocBin format for use with train command and other + experiment management functions. """ + cli_args = locals() no_print = output_dir == "-" + output_dir = Path(output_dir) if output_dir != "-" else "-" msg = Printer(no_print=no_print) - input_path = Path(input_file) - if file_type not in FILE_TYPES_STDOUT and output_dir == "-": - # TODO: support msgpack via stdout in srsly? - msg.fail( - f"Can't write .{file_type} data to stdout", - "Please specify an output directory.", - exits=1, + verify_cli_args(msg, **cli_args) + converter = _get_converter(msg, converter, input_path) + ner_map = srsly.read_json(ner_map) if ner_map is not None else None + for input_loc in walk_directory(input_path): + input_data = input_loc.open("r", encoding="utf-8").read() + # Use converter function to convert data + func = CONVERTERS[converter] + docs = func( + input_data, + n_sents=n_sents, + seg_sents=seg_sents, + append_morphology=morphology, + merge_subtokens=merge_subtokens, + lang=lang, + model=model, + no_print=no_print, + ner_map=ner_map, ) - if not input_path.exists(): - msg.fail("Input file not found", input_path, exits=1) - if output_dir != "-" and not Path(output_dir).exists(): - msg.fail("Output directory not found", output_dir, exits=1) - input_data = input_path.open("r", encoding="utf-8").read() - if converter == "auto": - converter = input_path.suffix[1:] - if converter == "ner" or converter == "iob": - converter_autodetect = autodetect_ner_format(input_data) - if converter_autodetect == "ner": - msg.info("Auto-detected token-per-line NER format") - converter = converter_autodetect - elif converter_autodetect == "iob": - msg.info("Auto-detected sentence-per-line NER format") - converter = converter_autodetect + suffix = f".{file_type}" + subpath = input_loc.relative_to(input_path) + output_file = (output_dir / subpath).with_suffix(suffix) + if not output_file.parent.exists(): + output_file.parent.mkdir(parents=True) + if file_type == "json": + data = docs2json(docs) + srsly.write_json(output_file, docs2json(docs)) else: - msg.warn( - "Can't automatically detect NER format. 
Conversion may not succeed. See https://spacy.io/api/cli#convert" - ) - if converter not in CONVERTERS: - msg.fail(f"Can't find converter for {converter}", exits=1) - ner_map = None - if ner_map_path is not None: - ner_map = srsly.read_json(ner_map_path) - # Use converter function to convert data - func = CONVERTERS[converter] - docs = func( - input_data, - n_sents=n_sents, - seg_sents=seg_sents, - append_morphology=morphology, - merge_subtokens=merge_subtokens, - lang=lang, - model=model, - no_print=no_print, - ner_map=ner_map, - ) - if write_json: - data = docs2json(docs) - else: - data = DocBin(attrs=ALL_ATTRS, docs=docs).to_bytes() - - if output_dir == "-": - if write_json: - srsly.write_json("-", data) - else: - sys.stdout.write(data) - else: - # Export data to a file - if write_json: - suffix = f".{file_type}" - output_file = output_dir / input_path.parts[-1].with_suffix(suffix) - srsly.write_json(output_file, data) - else: - output_file = output_dir / input_path.parts[-1].with_suffix("spacy") + data = DocBin(attrs=ALL_ATTRS, docs=docs).to_bytes() with output_file.open("wb") as file_: file_.write(data) - msg.good(f"Generated output file ({len(data)} documents): {output_file}") + msg.good(f"Generated output file ({len(docs)} documents): {output_file}") def autodetect_ner_format(input_data): @@ -148,3 +114,102 @@ def autodetect_ner_format(input_data): if format_guesses["ner"] == 0 and format_guesses["iob"] > 0: return "iob" return None + + +def walk_directory(path): + if not path.is_dir(): + return [path] + paths = [path] + locs = [] + seen = set() + for path in paths: + if str(path) in seen: + continue + seen.add(str(path)) + if path.parts[-1].startswith("."): + continue + elif path.is_dir(): + paths.extend(path.iterdir()) + else: + locs.append(path) + return locs + + +def verify_cli_args( + msg, + input_path, + output_dir, + file_type, + n_sents, + seg_sents, + model, + morphology, + merge_subtokens, + converter, + ner_map, + lang +): + if converter == "ner" or converter == "iob": + input_data = input_path.open("r", encoding="utf-8").read() + converter_autodetect = autodetect_ner_format(input_data) + if converter_autodetect == "ner": + msg.info("Auto-detected token-per-line NER format") + converter = converter_autodetect + elif converter_autodetect == "iob": + msg.info("Auto-detected sentence-per-line NER format") + converter = converter_autodetect + else: + msg.warn( + "Can't automatically detect NER format. Conversion may not", + "succeed. See https://spacy.io/api/cli#convert" + ) + if file_type not in FILE_TYPES_STDOUT and output_dir == "-": + # TODO: support msgpack via stdout in srsly? 
+ msg.fail( + f"Can't write .{file_type} data to stdout", + "Please specify an output directory.", + exits=1, + ) + if not input_path.exists(): + msg.fail("Input file not found", input_path, exits=1) + if output_dir != "-" and not Path(output_dir).exists(): + msg.fail("Output directory not found", output_dir, exits=1) + if input_path.is_dir(): + input_locs = walk_directory(input_path) + if len(input_locs) == 0: + msg.fail("No input files in directory", input_path, exits=1) + file_types = list(set([loc.suffix[1:] for loc in input_locs])) + if len(file_types) >= 2: + file_types = ",".join(file_types) + msg.fail("All input files must be same type", file_types, exits=1) + if converter == "auto": + converter = file_types[0] + else: + converter = input_path.suffix[1:] + if converter not in CONVERTERS: + msg.fail(f"Can't find converter for {converter}", exits=1) + return converter + + +def _get_converter(msg, converter, input_path): + if input_path.is_dir(): + input_path = walk_directory(input_path)[0] + if converter == "auto": + converter = input_path.suffix[1:] + if converter == "ner" or converter == "iob": + with input_path.open() as file_: + input_data = file_.read() + converter_autodetect = autodetect_ner_format(input_data) + if converter_autodetect == "ner": + msg.info("Auto-detected token-per-line NER format") + converter = converter_autodetect + elif converter_autodetect == "iob": + msg.info("Auto-detected sentence-per-line NER format") + converter = converter_autodetect + else: + msg.warn( + "Can't automatically detect NER format. " + "Conversion may not succeed. " + "See https://spacy.io/api/cli#convert" + ) + return converter From 476bcd4c5312ded0dfe06fe0d69687201c318124 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 15:55:57 +0200 Subject: [PATCH 13/49] Fix import --- spacy/gold/converters/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/gold/converters/__init__.py b/spacy/gold/converters/__init__.py index 6ccc0f8f5..c1b4b1566 100644 --- a/spacy/gold/converters/__init__.py +++ b/spacy/gold/converters/__init__.py @@ -1,6 +1,7 @@ from .iob2docs import iob2docs # noqa: F401 from .conll_ner2docs import conll_ner2docs # noqa: F401 from .jsonl2docs import ner_jsonl2docs # noqa: F401 +from .json2docs import json2docs # TODO: Update this one #from .conllu2docs import conllu2docs # noqa: F401 From 91fa2f112671e728706bb009cc0ceb9faaa06d96 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 15:56:05 +0200 Subject: [PATCH 14/49] Fix docbin --- spacy/tokens/_serialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 7bf3faab3..8f3e942e3 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -60,7 +60,7 @@ class DocBin(object): self.strings = set() self.store_user_data = store_user_data for doc in docs: - self.add(docs) + self.add(doc) def __len__(self): """RETURNS: The number of Doc objects added to the DocBin.""" From b7a366b435328b7f0e87cbfb11d5780a12980cd8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 15:56:16 +0200 Subject: [PATCH 15/49] Fix compile in ArcEager --- spacy/syntax/arc_eager.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 1512955a5..0dfcbf885 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -513,7 +513,6 @@ cdef class ArcEager(TransitionSystem): keeps = [i for i, s in enumerate(states) if not 
s.is_final()] states = [states[i] for i in keeps] golds = [ArcEagerGold(self, states[i], examples[i]) for i in keeps] - cdef StateClass s n_steps = sum([len(s.queue) * 4 for s in states]) return states, golds, n_steps From 3241acbe0b8a60c4cddd57f5f19bae20a19a31c3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 15:56:28 +0200 Subject: [PATCH 16/49] Fix import --- spacy/gold/converters/iob2docs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/gold/converters/iob2docs.py b/spacy/gold/converters/iob2docs.py index 2addc1af4..7901569fa 100644 --- a/spacy/gold/converters/iob2docs.py +++ b/spacy/gold/converters/iob2docs.py @@ -3,7 +3,7 @@ from wasabi import Printer from ...gold import iob_to_biluo, tags_to_entities from ...util import minibatch from .util import merge_sentences -from .conll_ner2json import n_sents_info +from .conll_ner2docs import n_sents_info def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs): From f5780cb160d1787d900bc1ca5f8795958a0474fb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 15:59:39 +0200 Subject: [PATCH 17/49] Serialize all attrs by default --- spacy/tokens/_serialize.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 8f3e942e3..3072787ae 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -9,6 +9,19 @@ from ..attrs import SPACY, ORTH, intify_attr from ..errors import Errors +ALL_ATTRS = ( + "ORTH", + "TAG", + "HEAD", + "DEP", + "SENT_START", + "ENT_IOB", + "ENT_TYPE", + "LEMMA", + "MORPH" +) + + class DocBin(object): """Pack Doc objects for binary serialization. @@ -39,7 +52,7 @@ class DocBin(object): document from the DocBin. """ - def __init__(self, attrs=None, store_user_data=False, docs=[]): + def __init__(self, attrs=ALL_ATTRS, store_user_data=False, docs=[]): """Create a DocBin object to hold serialized annotations. attrs (list): List of attributes to serialize. 
'orth' and 'spacy' are @@ -49,7 +62,6 @@ class DocBin(object): DOCS: https://spacy.io/api/docbin#init """ - attrs = attrs or [] attrs = sorted([intify_attr(attr) for attr in attrs]) self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY] self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0] From 5d89b1840ec9c3556d55bfcfedbf77bfe4ebb249 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 16:00:14 +0200 Subject: [PATCH 18/49] Update converter --- spacy/cli/convert.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 4cf960379..3b3aa0b91 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -25,17 +25,6 @@ CONVERTERS = { } -ALL_ATTRS = [ - "ORTH", - "TAG", - "HEAD", - "DEP", - "SENT_START", - "ENT_IOB", - "ENT_TYPE", - "LEMMA", - "MORPH", -] # File types FILE_TYPES = ("json", "jsonl", "msg") FILE_TYPES_STDOUT = ("json", "jsonl") @@ -91,7 +80,7 @@ def convert( data = docs2json(docs) srsly.write_json(output_file, docs2json(docs)) else: - data = DocBin(attrs=ALL_ATTRS, docs=docs).to_bytes() + data = DocBin(docs=docs).to_bytes() with output_file.open("wb") as file_: file_.write(data) msg.good(f"Generated output file ({len(docs)} documents): {output_file}") From f1756a6a222c99939d9433c64574a648df701edb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 16:02:40 +0200 Subject: [PATCH 19/49] Remove jsonl converter --- spacy/cli/convert.py | 2 -- spacy/gold/converters/__init__.py | 1 - spacy/gold/converters/jsonl2docs.py | 51 ----------------------------- 3 files changed, 54 deletions(-) delete mode 100644 spacy/gold/converters/jsonl2docs.py diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 3b3aa0b91..f4bddac39 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -6,7 +6,6 @@ import sys from ..tokens import DocBin from ..gold.converters import iob2docs, conll_ner2docs, json2docs -from ..gold.converters import ner_jsonl2docs # Converters are matched by file extension except for ner/iob, which are @@ -20,7 +19,6 @@ CONVERTERS = { #"conll": conllu2docs, TODO "ner": conll_ner2docs, "iob": iob2docs, - "jsonl": ner_jsonl2docs, "json": json2docs, } diff --git a/spacy/gold/converters/__init__.py b/spacy/gold/converters/__init__.py index c1b4b1566..0a1242fb4 100644 --- a/spacy/gold/converters/__init__.py +++ b/spacy/gold/converters/__init__.py @@ -1,6 +1,5 @@ from .iob2docs import iob2docs # noqa: F401 from .conll_ner2docs import conll_ner2docs # noqa: F401 -from .jsonl2docs import ner_jsonl2docs # noqa: F401 from .json2docs import json2docs # TODO: Update this one diff --git a/spacy/gold/converters/jsonl2docs.py b/spacy/gold/converters/jsonl2docs.py deleted file mode 100644 index 8639a11b9..000000000 --- a/spacy/gold/converters/jsonl2docs.py +++ /dev/null @@ -1,51 +0,0 @@ -import srsly - -from ...gold import docs_to_json -from ...util import get_lang_class, minibatch - - -def ner_jsonl2docs(input_data, lang=None, n_sents=10, use_morphology=False, **_): - if lang is None: - raise ValueError("No --lang specified, but tokenization required") - docs = [] - input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")] - nlp = get_lang_class(lang)() - sentencizer = nlp.create_pipe("sentencizer") - for i, batch in enumerate(minibatch(input_examples, size=n_sents)): - docs = [] - # TODO: Should we be merging these? We're disrespecting the n_sents - # currently. 
- for record in batch: - raw_text = record["text"] - if "entities" in record: - ents = record["entities"] - else: - ents = record["spans"] - ents = [(e["start"], e["end"], e["label"]) for e in ents] - doc = nlp.make_doc(raw_text) - sentencizer(doc) - spans = [doc.char_span(s, e, label=L) for s, e, L in ents] - doc.ents = _cleanup_spans(spans) - docs.append(doc) - return docs - - -def _cleanup_spans(spans): - output = [] - seen = set() - for span in spans: - if span is not None: - # Trim whitespace - while len(span) and span[0].is_space: - span = span[1:] - while len(span) and span[-1].is_space: - span = span[:-1] - if not len(span): - continue - for i in range(span.start, span.end): - if i in seen: - break - else: - output.append(span) - seen.update(range(span.start, span.end)) - return output From 7360d3db72e6663dd56c02f3dcdbf3874ebdc872 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 16:02:53 +0200 Subject: [PATCH 20/49] Add json2docs converter --- spacy/gold/converters/json2docs.py | 38 ++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 spacy/gold/converters/json2docs.py diff --git a/spacy/gold/converters/json2docs.py b/spacy/gold/converters/json2docs.py new file mode 100644 index 000000000..98219bb04 --- /dev/null +++ b/spacy/gold/converters/json2docs.py @@ -0,0 +1,38 @@ +import tempfile +import contextlib +import shutil +from pathlib import Path +from ..gold_io import read_json_file +from ..example import annotations2doc +from ..example import _fix_legacy_dict_data, _parse_example_dict_data +from ...util import load_model +from ...lang.xx import MultiLanguage + +@contextlib.contextmanager +def make_tempdir(): + d = Path(tempfile.mkdtemp()) + yield d + shutil.rmtree(str(d)) + + +def json2docs( + input_data, + model=None, + **kwargs +): + nlp = load_model(model) if model is not None else MultiLanguage() + docs = [] + with make_tempdir() as tmp_dir: + json_path = Path(tmp_dir) / "data.json" + with (json_path).open("w") as file_: + file_.write(input_data) + for json_annot in read_json_file(json_path): + example_dict = _fix_legacy_dict_data(json_annot) + tok_dict, doc_dict = _parse_example_dict_data(example_dict) + doc = annotations2doc( + nlp.vocab, + tok_dict, + doc_dict + ) + docs.append(doc) + return docs From 0de361cd00f7a841b112457f07800a110073bf77 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 18:31:07 +0200 Subject: [PATCH 21/49] Draft Corpus class for DocBin --- spacy/gold/corpus_docbin.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/gold/corpus_docbin.py b/spacy/gold/corpus_docbin.py index 3ebaa7376..a9562944c 100644 --- a/spacy/gold/corpus_docbin.py +++ b/spacy/gold/corpus_docbin.py @@ -5,7 +5,7 @@ from .example import Example from ..tokens import DocBin -class GoldCorpus(object): +class Corpus: """An annotated corpus, using the JSON file format. Manages annotations for tagging, dependency parsing and NER. 
@@ -38,7 +38,7 @@ class GoldCorpus(object): continue elif path.is_dir(): paths.extend(path.iterdir()) - elif path.parts[-1].endswith(".spacy") + elif path.parts[-1].endswith(".spacy"): locs.append(path) return locs From 11fa0658f739b31effadfef5c2f277674fc1a7b8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 20:12:19 +0200 Subject: [PATCH 22/49] Work on train script --- spacy/cli/train.py | 151 ++++++++++++++++++++------------------------- 1 file changed, 66 insertions(+), 85 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index fb4347158..64eb89d13 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -12,7 +12,7 @@ import thinc.schedules from thinc.api import Model, use_pytorch_for_gpu_memory import random -from ..gold import GoldCorpus +from ..gold.corpus_docbin import Corpus from ..lookups import Lookups from .. import util from ..errors import Errors @@ -148,26 +148,8 @@ def train_cli( command. """ util.set_env_log(verbose) + verify_cli_args(**locals()) - # Make sure all files and paths exists if they are needed - if not config_path or not config_path.exists(): - msg.fail("Config file not found", config_path, exits=1) - if not train_path or not train_path.exists(): - msg.fail("Training data not found", train_path, exits=1) - if not dev_path or not dev_path.exists(): - msg.fail("Development data not found", dev_path, exits=1) - if output_path is not None: - if not output_path.exists(): - output_path.mkdir() - msg.good(f"Created output directory: {output_path}") - elif output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]: - msg.warn( - "Output directory is not empty.", - "This can lead to unintended side effects when saving the model. " - "Please use an empty directory or a different path instead. If " - "the specified output path doesn't exist, the directory will be " - "created for you.", - ) if raw_text is not None: raw_text = list(srsly.read_jsonl(raw_text)) tag_map = {} @@ -176,9 +158,7 @@ def train_cli( weights_data = None if init_tok2vec is not None: - if not init_tok2vec.exists(): - msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1) - with init_tok2vec.open("rb") as file_: + with init_tok2vec.open("rb") as file_: weights_data = file_.read() if use_gpu >= 0: @@ -198,6 +178,7 @@ def train_cli( ) + def train( config_path, data_paths, @@ -221,60 +202,9 @@ def train( nlp = util.load_model_from_config(nlp_config) optimizer = training["optimizer"] limit = training["limit"] - msg.info("Loading training corpus") - corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit) - # verify textcat config + corpus = Corpus(data_paths["train"], data_paths["dev"], limit=limit) if "textcat" in nlp_config["pipeline"]: - textcat_labels = set(nlp.get_pipe("textcat").labels) - textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"]["exclusive_classes"] - - # check whether the setting 'exclusive_classes' corresponds to the provided training data - if textcat_multilabel: - multilabel_found = False - for eg in corpus.train_annotations: - cats = eg.reference.cats - textcat_labels.update(cats.keys()) - if list(cats.values()).count(1.0) != 1: - multilabel_found = True - if not multilabel_found: - msg.warn( - "The textcat training instances look like they have " - "mutually exclusive classes. Set 'exclusive_classes' " - "to 'true' in the config to train a classifier with " - "mutually exclusive classes more accurately." 
- ) - else: - for eg in corpus.train_annotations: - cats = eg.reference.cats - textcat_labels.update(cats.keys()) - if list(cats.values()).count(1.0) != 1: - msg.fail( - "Some textcat training instances do not have exactly " - "one positive label. Set 'exclusive_classes' " - "to 'false' in the config to train a classifier with classes " - "that are not mutually exclusive." - ) - msg.info(f"Initialized textcat component for {len(textcat_labels)} unique labels") - nlp.get_pipe("textcat").labels = tuple(textcat_labels) - - # if 'positive_label' is provided: double check whether it's in the data and the task is binary - if nlp_config["pipeline"]["textcat"].get("positive_label", None): - textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", []) - pos_label = nlp_config["pipeline"]["textcat"]["positive_label"] - if pos_label not in textcat_labels: - msg.fail( - f"The textcat's 'positive_label' config setting '{pos_label}' " - f"does not match any label in the training data.", - exits=1, - ) - if len(textcat_labels) != 2: - msg.fail( - f"A textcat 'positive_label' '{pos_label}' was " - f"provided for training data that does not appear to be a " - f"binary classification problem with two labels.", - exits=1, - ) - + verify_textcat_config(nlp, nlp_config) if training.get("resume", False): msg.info("Resuming training") nlp.resume_training() @@ -312,6 +242,7 @@ def train( ) tok2vec.from_bytes(weights_data) + msg.info("Loading training corpus") train_batches = create_train_batches(nlp, corpus, training) evaluate = create_evaluation_callback(nlp, optimizer, corpus, training) @@ -368,15 +299,7 @@ def train( def create_train_batches(nlp, corpus, cfg): epochs_todo = cfg.get("max_epochs", 0) while True: - train_examples = list( - corpus.train_dataset( - nlp, - orth_variant_level=cfg["orth_variant_level"], - gold_preproc=cfg["gold_preproc"], - max_length=cfg["max_length"], - ignore_misaligned=True, - ) - ) + train_examples = list(corpus.train_dataset(nlp)) if len(train_examples) == 0: raise ValueError(Errors.E988) @@ -598,3 +521,61 @@ def update_meta(training, nlp, info): nlp.meta["performance"][metric] = info["other_scores"][metric] for pipe_name in nlp.pipe_names: nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name] + + +def verify_cli_args( + train_path, + dev_path, + config_path, + output_path=None, + init_tok2vec=None, + raw_text=None, + verbose=False, + use_gpu=-1, + tag_map_path=None, + omit_extra_lookups=False, +): + # Make sure all files and paths exists if they are needed + if not config_path or not config_path.exists(): + msg.fail("Config file not found", config_path, exits=1) + if not train_path or not train_path.exists(): + msg.fail("Training data not found", train_path, exits=1) + if not dev_path or not dev_path.exists(): + msg.fail("Development data not found", dev_path, exits=1) + if output_path is not None: + if not output_path.exists(): + output_path.mkdir() + msg.good(f"Created output directory: {output_path}") + elif output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]: + msg.warn( + "Output directory is not empty.", + "This can lead to unintended side effects when saving the model. " + "Please use an empty directory or a different path instead. 
If " + "the specified output path doesn't exist, the directory will be " + "created for you.", + ) + if init_tok2vec is not None and not init_tok2vec.exists(): + msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1) + + +def verify_textcat_config(nlp, nlp_config): + msg.info(f"Initialized textcat component for {len(textcat_labels)} unique labels") + nlp.get_pipe("textcat").labels = tuple(textcat_labels) + # if 'positive_label' is provided: double check whether it's in the data and + # the task is binary + if nlp_config["pipeline"]["textcat"].get("positive_label", None): + textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", []) + pos_label = nlp_config["pipeline"]["textcat"]["positive_label"] + if pos_label not in textcat_labels: + msg.fail( + f"The textcat's 'positive_label' config setting '{pos_label}' " + f"does not match any label in the training data.", + exits=1, + ) + if len(textcat_labels) != 2: + msg.fail( + f"A textcat 'positive_label' '{pos_label}' was " + f"provided for training data that does not appear to be a " + f"binary classification problem with two labels.", + exits=1, + ) From 0a8b6631a26da1bf0959bd67d623b955e985dcec Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 20:12:31 +0200 Subject: [PATCH 23/49] Update Corpus --- spacy/gold/corpus_docbin.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/spacy/gold/corpus_docbin.py b/spacy/gold/corpus_docbin.py index a9562944c..8ee1e9a6c 100644 --- a/spacy/gold/corpus_docbin.py +++ b/spacy/gold/corpus_docbin.py @@ -1,5 +1,6 @@ import srsly from pathlib import Path +import random from .. import util from .example import Example from ..tokens import DocBin @@ -11,14 +12,13 @@ class Corpus: DOCS: https://spacy.io/api/goldcorpus """ - def __init__(self, vocab, train_loc, dev_loc, limit=0): + def __init__(self, train_loc, dev_loc, limit=0): """Create a GoldCorpus. train (str / Path): File or directory of training data. dev (str / Path): File or directory of development data. RETURNS (GoldCorpus): The newly created object. """ - self.vocab = vocab self.train_loc = train_loc self.dev_loc = dev_loc @@ -42,7 +42,12 @@ class Corpus: locs.append(path) return locs - def read_docbin(self, locs, limit=0): + def make_examples(self, nlp, reference_docs, **kwargs): + for reference in reference_docs: + predicted = nlp.make_doc(reference.text) + yield Example(predicted, reference) + + def read_docbin(self, vocab, locs, limit=0): """ Yield training examples as example dicts """ i = 0 for loc in locs: @@ -50,31 +55,26 @@ class Corpus: if loc.parts[-1].endswith(".spacy"): with loc.open("rb") as file_: doc_bin = DocBin().from_bytes(file_.read()) - docs = list(doc_bin.get_docs(self.vocab)) - assert len(docs) % 2 == 0 - # Pair up the docs into the (predicted, reference) pairs. 
-                for i in range(0, len(docs), 2):
-                    predicted = docs[i]
-                    reference = docs[i+1]
-                    yield Example(predicted, reference)
+                yield from doc_bin.get_docs(vocab)
 
-    def count_train(self):
+    def count_train(self, nlp):
         """Returns count of words in train examples"""
         n = 0
         i = 0
-        for example in self.train_dataset():
+        for example in self.train_dataset(nlp):
             n += len(example.predicted)
             if self.limit and i >= self.limit:
                 break
             i += 1
         return n
 
-    def train_dataset(self):
-        examples = self.read_docbin(self.walk_corpus(self.train_loc))
+    def train_dataset(self, nlp, **kwargs):
+        ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
+        examples = list(self.make_examples(nlp, ref_docs, **kwargs))
         random.shuffle(examples)
         yield from examples
 
-    def dev_dataset(self):
-        examples = self.read_docbin(self.walk_corpus(self.dev_loc))
-        random.shuffle(examples)
+    def dev_dataset(self, nlp):
+        ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc))
+        examples = self.make_examples(nlp, ref_docs)
         yield from examples

From 652f31d3ee1021f528b9b543de1d82b5c59b1262 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 20 Jun 2020 20:12:54 +0200
Subject: [PATCH 24/49] Update DocBin

---
 spacy/tokens/_serialize.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index 3072787ae..febfbd670 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -14,7 +14,6 @@ ALL_ATTRS = (
     "TAG",
     "HEAD",
     "DEP",
-    "SENT_START",
     "ENT_IOB",
     "ENT_TYPE",
     "LEMMA",
@@ -112,8 +111,7 @@ class DocBin(object):
         for i in range(len(self.tokens)):
             tokens = self.tokens[i]
             spaces = self.spaces[i]
-            words = [vocab.strings[orth] for orth in tokens[:, orth_col]]
-            doc = Doc(vocab, words=words, spaces=spaces)
+            doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces)
             doc = doc.from_array(self.attrs, tokens)
             doc.cats = self.cats[i]
             if self.store_user_data:

From fa86aa581d67900929d2bcbb09efa93eb5ea7abb Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 20 Jun 2020 20:15:21 +0200
Subject: [PATCH 25/49] Allocate Doc before starting to add words

---
 spacy/tokens/doc.pyx | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index f9e7c97dd..686f3be54 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -3,6 +3,7 @@ cimport cython
 cimport numpy as np
 from libc.string cimport memcpy, memset
 from libc.math cimport sqrt
+from libc.stdint cimport int32_t, uint64_t
 
 from collections import Counter
 import numpy
@@ -186,7 +187,7 @@ cdef class Doc:
         DOCS: https://spacy.io/api/doc#init
         """
         self.vocab = vocab
-        size = 20
+        size = max(20, (len(words) if words is not None else 0))
         self.mem = Pool()
         # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
         # However, we need to remember the true starting places, so that we can
@@ -211,7 +212,6 @@ cdef class Doc:
         self.user_data = {} if user_data is None else user_data
         self._vector = None
         self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
-        cdef unicode orth
         cdef bint has_space
         if orths_and_spaces is None and words is not None:
             if spaces is None:
@@ -219,19 +219,22 @@ cdef class Doc:
             elif len(spaces) != len(words):
                 raise ValueError(Errors.E027)
             orths_and_spaces = zip(words, spaces)
+        cdef const LexemeC* lexeme
         if orths_and_spaces is not None:
+            orths_and_spaces = list(orths_and_spaces)
             for orth_space in orths_and_spaces:
                 if isinstance(orth_space, unicode):
-                    orth = orth_space
+ lexeme = self.vocab.get(self.mem, orth_space) has_space = True elif isinstance(orth_space, bytes): raise ValueError(Errors.E028.format(value=orth_space)) + elif isinstance(orth_space[0], unicode): + lexeme = self.vocab.get(self.mem, orth_space[0]) + has_space = orth_space[1] else: - orth, has_space = orth_space - # Note that we pass self.mem here --- we have ownership, if LexemeC - # must be created. - self.push_back( - self.vocab.get(self.mem, orth), has_space) + lexeme = self.vocab.get_by_orth(self.mem, orth_space[0]) + has_space = orth_space[1] + self.push_back(lexeme, has_space) # Tough to decide on policy for this. Is an empty doc tagged and parsed? # There's no information we'd like to add to it, so I guess so? if self.length == 0: @@ -753,6 +756,8 @@ cdef class Doc: return dict(counts) def _realloc(self, new_size): + if new_size < self.max_length: + return self.max_length = new_size n = new_size + (PADDING * 2) # What we're storing is a "padded" array. We've jumped forward PADDING From 6d821b2e5559151f28880da0ff4a90e391e87657 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 20:17:13 +0200 Subject: [PATCH 26/49] Make doc.from_array several times faster --- spacy/tokens/doc.pyx | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 686f3be54..72a16b854 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -806,12 +806,14 @@ cdef class Doc: if SENT_START in attrs and HEAD in attrs: raise ValueError(Errors.E032) - cdef int i, col, abs_head_index + cdef int i, col + cdef int32_t abs_head_index cdef attr_id_t attr_id cdef TokenC* tokens = self.c cdef int length = len(array) if length != len(self): raise ValueError("Cannot set array values longer than the document.") + # Get set up for fast loading cdef Pool mem = Pool() cdef int n_attrs = len(attrs) @@ -822,33 +824,52 @@ cdef class Doc: attr_ids[i] = attr_id if len(array.shape) == 1: array = array.reshape((array.size, 1)) + cdef np.ndarray transposed_array = numpy.ascontiguousarray(array.T) + values = transposed_array.data + stride = transposed_array.shape[1] # Check that all heads are within the document bounds if HEAD in attrs: col = attrs.index(HEAD) for i in range(length): # cast index to signed int - abs_head_index = numpy.int32(array[i, col]) + i + abs_head_index = values[col * stride + i] + abs_head_index += i if abs_head_index < 0 or abs_head_index >= length: - raise ValueError(Errors.E190.format(index=i, value=array[i, col], rel_head_index=numpy.int32(array[i, col]))) + raise ValueError( + Errors.E190.format( + index=i, + value=array[i, col], + rel_head_index=abs_head_index-i + ) + ) # Do TAG first. 
This lets subsequent loop override stuff like POS, LEMMA if TAG in attrs: col = attrs.index(TAG) for i in range(length): - if array[i, col] != 0: - self.vocab.morphology.assign_tag(&tokens[i], array[i, col]) + value = values[col * stride + i] + if value != 0: + self.vocab.morphology.assign_tag(&tokens[i], value) # Verify ENT_IOB are proper integers if ENT_IOB in attrs: iob_strings = Token.iob_strings() col = attrs.index(ENT_IOB) + n_iob_strings = len(iob_strings) for i in range(length): - if array[i, col] not in range(0, len(iob_strings)): - raise ValueError(Errors.E982.format(values=iob_strings, value=array[i, col])) + value = values[col * stride + i] + if value < 0 or value >= n_iob_strings: + raise ValueError( + Errors.E982.format( + values=iob_strings, + value=value + ) + ) # Now load the data for i in range(length): token = &self.c[i] for j in range(n_attrs): if attr_ids[j] != TAG: - Token.set_struct_attr(token, attr_ids[j], array[i, j]) + value = values[j * stride + i] + Token.set_struct_attr(token, attr_ids[j], value) # Set flags self.is_parsed = bool(self.is_parsed or HEAD in attrs) self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs) From 450c6fe39c6e3b32bc00cd20b844e37dd0adee5a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 21:49:06 +0200 Subject: [PATCH 27/49] Update train.py --- spacy/cli/train.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 64eb89d13..3420c96fa 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -210,7 +210,8 @@ def train( nlp.resume_training() else: msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}") - nlp.begin_training(lambda: corpus.train_dataset(nlp)) + train_examples = list(corpus.train_dataset(nlp, shuffle=False)) + nlp.begin_training(lambda: train_examples) # Update tag map with provided mapping nlp.vocab.morphology.tag_map.update(tag_map) @@ -280,11 +281,14 @@ def train( eg.reference = None eg.predicted = None except Exception as e: - msg.warn( - f"Aborting and saving the final best model. " - f"Encountered exception: {str(e)}", - exits=1, - ) + if output_path is not None: + msg.warn( + f"Aborting and saving the final best model. 
" + f"Encountered exception: {str(e)}", + exits=1, + ) + else: + raise e finally: if output_path is not None: final_model_path = output_path / "model-final" @@ -300,7 +304,6 @@ def create_train_batches(nlp, corpus, cfg): epochs_todo = cfg.get("max_epochs", 0) while True: train_examples = list(corpus.train_dataset(nlp)) - if len(train_examples) == 0: raise ValueError(Errors.E988) random.shuffle(train_examples) From 396dd60b3a9ef62f27bf406aff82b167ed8c63a3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 21:49:15 +0200 Subject: [PATCH 28/49] Fix Corpus --- spacy/gold/corpus_docbin.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/spacy/gold/corpus_docbin.py b/spacy/gold/corpus_docbin.py index 8ee1e9a6c..750217c8c 100644 --- a/spacy/gold/corpus_docbin.py +++ b/spacy/gold/corpus_docbin.py @@ -68,10 +68,12 @@ class Corpus: i += 1 return n - def train_dataset(self, nlp, **kwargs): + def train_dataset(self, nlp, shuffle=True, **kwargs): ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc)) - examples = list(self.make_examples(nlp, ref_docs, **kwargs)) - random.shuffle(examples) + examples = self.make_examples(nlp, ref_docs, **kwargs) + if shuffle: + examples = list(examples) + random.shuffle(examples) yield from examples def dev_dataset(self, nlp): From 2bcb5881d70d550afb09fe7dcc9cda8e260ca53a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 21:49:31 +0200 Subject: [PATCH 29/49] Fix parser model --- spacy/syntax/_parser_model.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index eef5723f3..d3093d60d 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -261,7 +261,7 @@ class ParserStepModel(Model): def get_token_ids(self, states): cdef StateClass state - states = [state for state in states() if not state.is_final()] + states = [state for state in states if not state.is_final()] cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF), dtype='i', order='C') ids.fill(-1) From 0c10831b14edddd9c6491c0edfd7ab81bcdc7e98 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 21:49:46 +0200 Subject: [PATCH 30/49] Start debugging arc_eager oracle --- spacy/syntax/arc_eager.pyx | 48 +++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 0dfcbf885..b0fedd6c4 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -76,18 +76,27 @@ cdef GoldParseStateC create_gold_state(Pool mem, StateClass stcls, Example examp gs.n_kids_in_stack = mem.alloc(gs.length, sizeof(gs.n_kids_in_stack[0])) cand_to_gold = example.alignment.cand_to_gold + gold_to_cand = example.alignment.cand_to_gold cdef TokenC ref_tok for cand_i in range(example.x.length): gold_i = cand_to_gold[cand_i] - if cand_i is not None: # Alignment found + if gold_i is not None: # Alignment found ref_tok = example.y.c[gold_i] - gs.heads[cand_i] = ref_tok.head - gs.labels[cand_i] = ref_tok.dep - gs.state_bits[cand_i] = set_state_flag( - gs.state_bits[cand_i], - HEAD_UNKNOWN, - 0 - ) + gold_head = gold_to_cand[ref_tok.head + gold_i] + if gold_head is not None: + gs.heads[cand_i] = gold_head + gs.labels[cand_i] = ref_tok.dep + gs.state_bits[cand_i] = set_state_flag( + gs.state_bits[cand_i], + HEAD_UNKNOWN, + 0 + ) + else: + gs.state_bits[cand_i] = set_state_flag( + gs.state_bits[cand_i], + HEAD_UNKNOWN, + 1 
+ ) else: gs.state_bits[cand_i] = set_state_flag( gs.state_bits[cand_i], @@ -135,6 +144,8 @@ cdef GoldParseStateC create_gold_state(Pool mem, StateClass stcls, Example examp cdef class ArcEagerGold: cdef GoldParseStateC c + cdef Pool mem + def __init__(self, ArcEager moves, StateClass stcls, Example example): self.mem = Pool() self.c = create_gold_state(self.mem, stcls, example) @@ -610,9 +621,8 @@ cdef class ArcEager(TransitionSystem): output[i] = is_valid[self.c[i].move] cdef int set_costs(self, int* is_valid, weight_t* costs, - StateClass stcls, Example example) except -1: - cdef Pool mem = Pool() - gold_state = create_gold_state(mem, stcls, example) + StateClass stcls, gold) except -1: + gold_state = (gold).c cdef int i, move cdef attr_t label cdef label_cost_func_t[N_MOVES] label_cost_funcs @@ -643,16 +653,16 @@ cdef class ArcEager(TransitionSystem): label = self.c[i].label if move_costs[move] == 9000: move_costs[move] = move_cost_funcs[move](stcls, &gold_state) - costs[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold_state, label) + move_cost = move_costs[move] + label_cost = label_cost_funcs[move](stcls, &gold_state, label) + costs[i] = move_cost + label_cost n_gold += costs[i] <= 0 + print(move, label, costs[i]) else: is_valid[i] = False costs[i] = 9000 if n_gold < 1: - # Check projectivity --- leading cause - if is_nonproj_tree(example.get_field("HEAD")): - raise ValueError(Errors.E020) - else: - failure_state = stcls.print_state([t.text for t in example]) - raise ValueError(Errors.E021.format(n_actions=self.n_moves, - state=failure_state)) + raise ValueError + #failure_state = stcls.print_state([t.text for t in example]) + #raise ValueError( + # Errors.E021.format(n_actions=self.n_moves, state=failure_state)) From 52edb24f075de6c413e752a7cea712817c2b730a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 21:50:06 +0200 Subject: [PATCH 31/49] Update header --- spacy/syntax/transition_system.pxd | 2 +- spacy/syntax/transition_system.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index 21752b15f..836c08168 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -52,4 +52,4 @@ cdef class TransitionSystem: cdef int set_valid(self, int* output, const StateC* st) nogil cdef int set_costs(self, int* is_valid, weight_t* costs, - StateClass state, Example example) except -1 + StateClass state, gold) except -1 diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 687c234d0..319550161 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -115,7 +115,7 @@ cdef class TransitionSystem: is_valid[i] = self.c[i].is_valid(st, self.c[i].label) cdef int set_costs(self, int* is_valid, weight_t* costs, - StateClass stcls, Example example) except -1: + StateClass stcls, gold) except -1: raise NotImplementedError def get_class_name(self, int clas): From 6af99f2f2d156181fe7b210ffe74fc9679e436be Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 21:50:17 +0200 Subject: [PATCH 32/49] Fix parser declaration --- spacy/syntax/nn_parser.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 22e0e7995..f36b10bcc 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -273,6 +273,7 @@ cdef class Parser: [eg.predicted for eg in examples]) states, golds, max_steps 
= self.moves.init_gold_batch(examples) all_states = list(states) + states_golds = zip(states, golds) for _ in range(max_steps): if not states_golds: break @@ -353,7 +354,6 @@ cdef class Parser: def get_batch_loss(self, states, golds, float[:, ::1] scores, losses): cdef StateClass state - cdef Example example cdef Pool mem = Pool() cdef int i From 0b23fd3891e14ff8d6d0f071ca3dc1d0a50a47e4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 21:52:57 +0200 Subject: [PATCH 33/49] Xfail some tests --- spacy/tests/test_cli.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 132f7ac9f..4b244a3ce 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,10 +1,13 @@ import pytest from spacy.lang.en import English -from spacy.cli.converters import conllu2json, iob2json, conll_ner2json +from spacy.gold.converters import iob2docs, conll_ner2docs from spacy.cli.pretrain import make_docs +# TODO +# from spacy.gold.converters import conllu2docs +@pytest.mark.xfail def test_cli_converters_conllu2json(): # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu lines = [ @@ -29,6 +32,7 @@ def test_cli_converters_conllu2json(): assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"] +@pytest.mark.xfail @pytest.mark.parametrize( "lines", [ @@ -66,6 +70,7 @@ def test_cli_converters_conllu2json_name_ner_map(lines): assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"] +@pytest.mark.xfail def test_cli_converters_conllu2json_subtokens(): # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu lines = [ @@ -109,6 +114,7 @@ def test_cli_converters_conllu2json_subtokens(): assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"] +@pytest.mark.xfail def test_cli_converters_iob2json(): lines = [ "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O", @@ -132,6 +138,7 @@ def test_cli_converters_iob2json(): # fmt: on +@pytest.mark.xfail def test_cli_converters_conll_ner2json(): lines = [ "-DOCSTART- -X- O O", From 095710e40e96c06996bc2798b9d0a1cfba09f979 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 22:02:32 +0200 Subject: [PATCH 34/49] Skip tests that cause crashes --- spacy/tests/parser/test_add_label.py | 7 ++++++- spacy/tests/parser/test_parse.py | 7 +++++-- spacy/tests/parser/test_preset_sbd.py | 4 ++++ spacy/tests/regression/test_issue4665.py | 5 ++++- 4 files changed, 19 insertions(+), 4 deletions(-) diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index 7d8063242..093d4e266 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -44,6 +44,8 @@ def _train_parser(parser): return parser +# Segfaulting due to refactor. Need to fix. +@pytest.mark.skip def test_add_label(parser): parser = _train_parser(parser) parser.add_label("right") @@ -62,6 +64,8 @@ def test_add_label(parser): assert doc[2].dep_ == "left" +# segfaulting due to refactor. need to fix. 
+@pytest.mark.skip def test_add_label_deserializes_correctly(): config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} ner1 = EntityRecognizer(Vocab(), default_ner(), **config) @@ -78,7 +82,8 @@ def test_add_label_deserializes_correctly(): for i in range(ner1.moves.n_moves): assert ner1.moves.get_class_name(i) == ner2.moves.get_class_name(i) - +# segfaulting due to refactor. need to fix. +@pytest.mark.skip @pytest.mark.parametrize( "pipe_cls,n_moves,model", [(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())], diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 6e13d3044..ab9228533 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -46,7 +46,7 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text): assert doc[0].dep != 0 -@pytest.mark.xfail +@pytest.mark.skip # Segfault def test_parser_initial(en_tokenizer, en_parser): text = "I ate the pizza with anchovies." # heads = [1, 0, 1, -2, -3, -1, -5] @@ -59,6 +59,7 @@ def test_parser_initial(en_tokenizer, en_parser): assert tokens[3].head.i == 3 +@pytest.mark.skip # Segfault def test_parser_parse_subtrees(en_tokenizer, en_parser): text = "The four wheels on the bus turned quickly" heads = [2, 1, 4, -1, 1, -2, 0, -1] @@ -73,6 +74,7 @@ def test_parser_parse_subtrees(en_tokenizer, en_parser): assert len(list(doc[2].subtree)) == 6 +@pytest.mark.skip # Segfault def test_parser_merge_pp(en_tokenizer): text = "A phrase with another phrase occurs" heads = [1, 4, -1, 1, -2, 0] @@ -91,7 +93,7 @@ def test_parser_merge_pp(en_tokenizer): assert doc[3].text == "occurs" -@pytest.mark.xfail +@pytest.mark.skip # Segfault def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser): text = "a b c d e" @@ -166,6 +168,7 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser): assert tokens[4].head.i == 4 +@pytest.mark.skip # Segfault def test_parser_set_sent_starts(en_vocab): # fmt: off words = ['Ein', 'Satz', '.', 'Außerdem', 'ist', 'Zimmer', 'davon', 'überzeugt', ',', 'dass', 'auch', 'epige-', '\n', 'netische', 'Mechanismen', 'eine', 'Rolle', 'spielen', ',', 'also', 'Vorgänge', ',', 'die', '\n', 'sich', 'darauf', 'auswirken', ',', 'welche', 'Gene', 'abgelesen', 'werden', 'und', '\n', 'welche', 'nicht', '.', '\n'] diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index 5a29d84f4..9a2e1cfe8 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -33,12 +33,14 @@ def parser(vocab): return parser +@pytest.mark.skip # Segfaults def test_no_sentences(parser): doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc = parser(doc) assert len(list(doc.sents)) >= 1 +@pytest.mark.skip # Segfaults def test_sents_1(parser): doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc[2].sent_start = True @@ -52,6 +54,7 @@ def test_sents_1(parser): assert len(list(doc.sents)) == 2 +@pytest.mark.skip # Segfaults def test_sents_1_2(parser): doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc[1].sent_start = True @@ -60,6 +63,7 @@ def test_sents_1_2(parser): assert len(list(doc.sents)) >= 3 +@pytest.mark.skip # Segfaults def test_sents_1_3(parser): doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc[1].sent_start = True diff --git a/spacy/tests/regression/test_issue4665.py b/spacy/tests/regression/test_issue4665.py index 721ec0098..cb9279250 100644 --- a/spacy/tests/regression/test_issue4665.py +++ 
b/spacy/tests/regression/test_issue4665.py @@ -1,4 +1,6 @@ -from spacy.cli.converters.conllu2json import conllu2json +import pytest +# TODO +#from spacy.gold.converters.conllu2docs import conllu2docs input_data = """ 1 [ _ PUNCT -LRB- _ _ punct _ _ @@ -22,6 +24,7 @@ input_data = """ """ +@pytest.mark.xfail def test_issue4665(): """ conllu2json should not raise an exception if the HEAD column contains an From fd83551eb592b39fe97c0abee68d7e4b51dd53d9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 22:11:27 +0200 Subject: [PATCH 35/49] Skip test causing segfault --- spacy/tests/parser/test_parse.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index ab9228533..80d91e7ae 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -22,6 +22,7 @@ TRAIN_DATA = [ ] +@pytest.mark.skip # Segfault def test_parser_root(en_tokenizer): text = "i don't have other assistance" heads = [3, 2, 1, 0, 1, -2] @@ -32,8 +33,9 @@ def test_parser_root(en_tokenizer): assert t.dep != 0, t.text -@pytest.mark.xfail -@pytest.mark.parametrize("text", ["Hello"]) +#@pytest.mark.xfail +#@pytest.mark.parametrize("text", ["Hello"]) +@pytest.mark.skip # Segfault def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text): tokens = en_tokenizer(text) doc = get_doc( @@ -185,7 +187,7 @@ def test_parser_set_sent_starts(en_vocab): for token in sent: assert token.head in sent - +@pytest.mark.skip def test_overfitting_IO(): # Simple test to try and quickly overfit the dependency parser - ensuring the ML models work correctly nlp = English() From cfd024536db3a81592aac2343071c5272b62907d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 22:13:37 +0200 Subject: [PATCH 36/49] Remove GoldCorpus --- spacy/gold/corpus.py | 222 ------------------------------------------- 1 file changed, 222 deletions(-) delete mode 100644 spacy/gold/corpus.py diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py deleted file mode 100644 index c84f8355f..000000000 --- a/spacy/gold/corpus.py +++ /dev/null @@ -1,222 +0,0 @@ -import random -import shutil -import tempfile -import srsly -from pathlib import Path -import itertools -from ..tokens import Doc -from .. import util -from ..errors import Errors, AlignmentError -from .gold_io import read_json_file, json_to_annotations -from .augment import make_orth_variants -from .example import Example - - -class GoldCorpus(object): - """An annotated corpus, using the JSON file format. Manages - annotations for tagging, dependency parsing and NER. - - DOCS: https://spacy.io/api/goldcorpus - """ - - def __init__(self, train, dev, gold_preproc=False, limit=None): - """Create a GoldCorpus. - - train (str / Path): File or directory of training data. - dev (str / Path): File or directory of development data. - RETURNS (GoldCorpus): The newly created object. 
- """ - self.limit = limit - if isinstance(train, str) or isinstance(train, Path): - train = self.read_annotations(self.walk_corpus(train)) - dev = self.read_annotations(self.walk_corpus(dev)) - # Write temp directory with one doc per file, so we can shuffle and stream - self.tmp_dir = Path(tempfile.mkdtemp()) - self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit) - self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit) - - def __del__(self): - shutil.rmtree(self.tmp_dir) - - @staticmethod - def write_msgpack(directory, examples, limit=0): - if not directory.exists(): - directory.mkdir() - n = 0 - for i, ex_dict in enumerate(examples): - text = ex_dict["text"] - srsly.write_msgpack(directory / f"{i}.msg", (text, ex_dict)) - n += 1 - if limit and n >= limit: - break - - @staticmethod - def walk_corpus(path): - path = util.ensure_path(path) - if not path.is_dir(): - return [path] - paths = [path] - locs = [] - seen = set() - for path in paths: - if str(path) in seen: - continue - seen.add(str(path)) - if path.parts[-1].startswith("."): - continue - elif path.is_dir(): - paths.extend(path.iterdir()) - elif path.parts[-1].endswith((".json", ".jsonl")): - locs.append(path) - return locs - - @staticmethod - def read_annotations(locs, limit=0): - """ Yield training examples as example dicts """ - i = 0 - for loc in locs: - loc = util.ensure_path(loc) - file_name = loc.parts[-1] - if file_name.endswith("json"): - examples = read_json_file(loc) - elif file_name.endswith("jsonl"): - gold_tuples = srsly.read_jsonl(loc) - first_gold_tuple = next(gold_tuples) - gold_tuples = itertools.chain([first_gold_tuple], gold_tuples) - # TODO: proper format checks with schemas - if isinstance(first_gold_tuple, dict): - if first_gold_tuple.get("paragraphs", None): - examples = [] - for json_doc in gold_tuples: - examples.extend(json_to_annotations(json_doc)) - elif first_gold_tuple.get("doc_annotation", None): - examples = [] - for ex_dict in gold_tuples: - doc = ex_dict.get("doc", None) - if doc is None: - doc = ex_dict.get("text", None) - if not ( - doc is None - or isinstance(doc, Doc) - or isinstance(doc, str) - ): - raise ValueError(Errors.E987.format(type=type(doc))) - examples.append(ex_dict) - - elif file_name.endswith("msg"): - text, ex_dict = srsly.read_msgpack(loc) - examples = [ex_dict] - else: - supported = ("json", "jsonl", "msg") - raise ValueError(Errors.E124.format(path=loc, formats=supported)) - try: - for example in examples: - yield example - i += 1 - if limit and i >= limit: - return - except KeyError as e: - msg = "Missing key {}".format(e) - raise KeyError(Errors.E996.format(file=file_name, msg=msg)) - except UnboundLocalError as e: - msg = "Unexpected document structure" - raise ValueError(Errors.E996.format(file=file_name, msg=msg)) - - @property - def dev_annotations(self): - locs = (self.tmp_dir / "dev").iterdir() - yield from self.read_annotations(locs, limit=self.limit) - - @property - def train_annotations(self): - locs = (self.tmp_dir / "train").iterdir() - yield from self.read_annotations(locs, limit=self.limit) - - def count_train(self): - """Returns count of words in train examples""" - n = 0 - i = 0 - for eg_dict in self.train_annotations: - n += len(eg_dict["token_annotation"]["words"]) - if self.limit and i >= self.limit: - break - i += 1 - return n - - def train_dataset( - self, - nlp, - gold_preproc=False, - max_length=None, - orth_variant_level=0.0, - ignore_misaligned=False, - ): - locs = list((self.tmp_dir / "train").iterdir()) - 
random.shuffle(locs) - train_annotations = self.read_annotations(locs, limit=self.limit) - examples = self.iter_examples( - nlp, - train_annotations, - gold_preproc, - max_length=max_length, - orth_variant_level=orth_variant_level, - make_projective=True, - ignore_misaligned=ignore_misaligned, - ) - yield from examples - - def train_dataset_without_preprocessing( - self, nlp, gold_preproc=False, ignore_misaligned=False - ): - examples = self.iter_examples( - nlp, - self.train_annotations, - gold_preproc=gold_preproc, - ignore_misaligned=ignore_misaligned, - ) - yield from examples - - def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False): - examples = self.iter_examples( - nlp, - self.dev_annotations, - gold_preproc=gold_preproc, - ignore_misaligned=ignore_misaligned, - ) - yield from examples - - @classmethod - def iter_examples( - cls, - nlp, - annotations, - gold_preproc, - max_length=None, - orth_variant_level=0.0, - make_projective=False, - ignore_misaligned=False, - ): - """ Setting gold_preproc will result in creating a doc per sentence """ - for eg_dict in annotations: - token_annot = eg_dict.get("token_annotation", {}) - if eg_dict["text"]: - doc = nlp.make_doc(eg_dict["text"]) - elif "words" in token_annot: - doc = Doc(nlp.vocab, words=token_annot["words"]) - else: - raise ValueError("Expecting either 'text' or token_annotation.words annotation") - - if gold_preproc: - variant_text, variant_token_annot = make_orth_variants(nlp, doc.text, token_annot, orth_variant_level) - doc = nlp.make_doc(variant_text) - eg_dict["token_annotation"] = variant_token_annot - example = Example.from_dict(doc, eg_dict) - examples = example.split_sents() - - else: - example = Example.from_dict(doc, eg_dict) - examples = [example] - - for eg in examples: - if (not max_length) or len(eg.predicted) < max_length: - yield eg From 64d00520e2ee45a8b11446c7df9edd9046dd544a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 22:21:08 +0200 Subject: [PATCH 37/49] Update imports --- spacy/cli/debug_data.py | 4 ++-- spacy/cli/evaluate.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index c86408170..e0a6cba2e 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -4,7 +4,7 @@ import sys import srsly from wasabi import Printer, MESSAGES -from ..gold import GoldCorpus +from ..gold import Corpus from ..syntax import nonproj from ..util import load_model, get_lang_class @@ -68,7 +68,7 @@ def debug_data( loading_train_error_message = "" loading_dev_error_message = "" with msg.loading("Loading corpus..."): - corpus = GoldCorpus(train_path, dev_path) + corpus = Corpus(train_path, dev_path) try: train_dataset = list(corpus.train_dataset(nlp)) train_dataset_unpreprocessed = list( diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index bae252b1c..09ce7c1b5 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -1,7 +1,7 @@ from timeit import default_timer as timer from wasabi import msg -from ..gold import GoldCorpus +from ..gold import Corpus from .. import util from .. 
import displacy @@ -31,7 +31,7 @@ def evaluate( msg.fail("Evaluation data not found", data_path, exits=1) if displacy_path and not displacy_path.exists(): msg.fail("Visualization output directory not found", displacy_path, exits=1) - corpus = GoldCorpus(data_path, data_path) + corpus = Corpus(data_path, data_path) if model.startswith("blank:"): nlp = util.get_lang_class(model.replace("blank:", ""))() else: From 4bbc2777584808da383e4b79b98e174fed6563a0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 20 Jun 2020 22:21:24 +0200 Subject: [PATCH 38/49] Update after removing GoldCorpus --- spacy/about.py | 2 +- spacy/gold/__init__.py | 2 +- spacy/tests/regression/test_issue4402.py | 4 ++-- spacy/tests/test_gold.py | 12 ++++++------ 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/spacy/about.py b/spacy/about.py index 04a660ad1..14ea60c8c 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.0.0.dev9" +__version__ = "3.0.0" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" diff --git a/spacy/gold/__init__.py b/spacy/gold/__init__.py index 22530a757..c2d237f84 100644 --- a/spacy/gold/__init__.py +++ b/spacy/gold/__init__.py @@ -1,4 +1,4 @@ -from .corpus import GoldCorpus +from .corpus_docbin import Corpus from .example import Example from .align import align diff --git a/spacy/tests/regression/test_issue4402.py b/spacy/tests/regression/test_issue4402.py index 80d37b1e6..71ed7ec14 100644 --- a/spacy/tests/regression/test_issue4402.py +++ b/spacy/tests/regression/test_issue4402.py @@ -1,5 +1,5 @@ import srsly -from spacy.gold import GoldCorpus +from spacy.gold import Corpus from spacy.lang.en import English from ..util import make_tempdir @@ -11,7 +11,7 @@ def test_issue4402(): json_path = tmpdir / "test4402.json" srsly.write_json(json_path, json_data) - corpus = GoldCorpus(str(json_path), str(json_path)) + corpus = Corpus(str(json_path), str(json_path)) train_data = list(corpus.train_dataset(nlp, gold_preproc=True, max_length=0)) # assert that the data got split into 4 sentences diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 726492138..7af62accb 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -1,7 +1,7 @@ from spacy.errors import AlignmentError from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags from spacy.gold import spans_from_biluo_tags, iob_to_biluo, align -from spacy.gold import GoldCorpus, docs_to_json +from spacy.gold import Corpus, docs_to_json from spacy.gold.example import Example from spacy.lang.en import English from spacy.syntax.nonproj import is_nonproj_tree @@ -299,7 +299,7 @@ def test_roundtrip_docs_to_json(doc): with make_tempdir() as tmpdir: json_file = tmpdir / "roundtrip.json" srsly.write_json(json_file, [docs_to_json(doc)]) - goldcorpus = GoldCorpus(train=str(json_file), dev=str(json_file)) + goldcorpus = Corpus(train=str(json_file), dev=str(json_file)) reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp)) assert len(doc) == goldcorpus.count_train() @@ -328,7 +328,7 @@ def test_projective_train_vs_nonprojective_dev(doc): json_file = tmpdir / "test.json" # write to JSON train dicts srsly.write_json(json_file, [docs_to_json(doc)]) - goldcorpus = GoldCorpus(str(json_file), str(json_file)) + goldcorpus = Corpus(str(json_file), str(json_file)) train_reloaded_example = 
next(goldcorpus.train_dataset(nlp))
     train_goldparse = get_parses_from_example(train_reloaded_example)[0][1]
 
@@ -360,7 +360,7 @@ def test_ignore_misaligned(doc):
     data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
     # write to JSON train dicts
     srsly.write_json(json_file, data)
-    goldcorpus = GoldCorpus(str(json_file), str(json_file))
+    goldcorpus = Corpus(str(json_file), str(json_file))
 
     with pytest.raises(AlignmentError):
         train_reloaded_example = next(goldcorpus.train_dataset(nlp))
@@ -371,7 +371,7 @@ def test_ignore_misaligned(doc):
     data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
     # write to JSON train dicts
     srsly.write_json(json_file, data)
-    goldcorpus = GoldCorpus(str(json_file), str(json_file))
+    goldcorpus = Corpus(str(json_file), str(json_file))
 
     # doesn't raise an AlignmentError, but there is nothing to iterate over
     # because the only example can't be aligned
@@ -385,7 +385,7 @@ def test_make_orth_variants(doc):
         json_file = tmpdir / "test.json"
         # write to JSON train dicts
         srsly.write_json(json_file, [docs_to_json(doc)])
-        goldcorpus = GoldCorpus(str(json_file), str(json_file))
+        goldcorpus = Corpus(str(json_file), str(json_file))
 
         # due to randomness, test only that this runs with no errors for now
         train_example = next(goldcorpus.train_dataset(nlp))

From 2791c1c0dc69eeb756d8c69b3c0ddafc288dc00c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 20 Jun 2020 22:22:14 +0200
Subject: [PATCH 39/49] Fix module name of corpus

---
 spacy/gold/__init__.py                     | 2 +-
 spacy/gold/{corpus_docbin.py => corpus.py} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename spacy/gold/{corpus_docbin.py => corpus.py} (100%)

diff --git a/spacy/gold/__init__.py b/spacy/gold/__init__.py
index c2d237f84..9416bdd81 100644
--- a/spacy/gold/__init__.py
+++ b/spacy/gold/__init__.py
@@ -1,4 +1,4 @@
-from .corpus_docbin import Corpus
+from .corpus import Corpus
 from .example import Example
 from .align import align

diff --git a/spacy/gold/corpus_docbin.py b/spacy/gold/corpus.py
similarity index 100%
rename from spacy/gold/corpus_docbin.py
rename to spacy/gold/corpus.py

From 914924a68b3dbd2698a2dc7176e7d6f5d8562422 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 20 Jun 2020 22:22:40 +0200
Subject: [PATCH 40/49] Fix import

---
 spacy/cli/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 3420c96fa..6a1d74934 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -12,7 +12,7 @@ import thinc.schedules
 from thinc.api import Model, use_pytorch_for_gpu_memory
 import random
 
-from ..gold.corpus_docbin import Corpus
+from ..gold import Corpus
 from ..lookups import Lookups
 from ..
import util from ..errors import Errors From c58deb354632bfe417a1821c171ce1d6eeae77a6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 21 Jun 2020 01:01:09 +0200 Subject: [PATCH 41/49] Work on parser oracle --- spacy/syntax/arc_eager.pyx | 129 +++++++++++++++++++++++-------------- 1 file changed, 82 insertions(+), 47 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index b0fedd6c4..b8baab49a 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -82,7 +82,7 @@ cdef GoldParseStateC create_gold_state(Pool mem, StateClass stcls, Example examp gold_i = cand_to_gold[cand_i] if gold_i is not None: # Alignment found ref_tok = example.y.c[gold_i] - gold_head = gold_to_cand[ref_tok.head + gold_i] + gold_head = gold_to_cand[gold_i + ref_tok.head] if gold_head is not None: gs.heads[cand_i] = gold_head gs.labels[cand_i] = ref_tok.dep @@ -106,17 +106,17 @@ cdef GoldParseStateC create_gold_state(Pool mem, StateClass stcls, Example examp stack_words = set() for i in range(stcls.stack_depth()): s_i = stcls.S(i) - head = s_i + gs.heads[s_i] + head = gs.heads[s_i] gs.n_kids_in_stack[head] += 1 stack_words.add(s_i) buffer_words = set() for i in range(stcls.buffer_length()): b_i = stcls.B(i) - head = b_i + gs.heads[b_i] + head = gs.heads[b_i] gs.n_kids_in_buffer[head] += 1 buffer_words.add(b_i) for i in range(gs.length): - head = i + gs.heads[i] + head = gs.heads[i] if head in stack_words: gs.state_bits[i] = set_state_flag( gs.state_bits[i], @@ -142,6 +142,58 @@ cdef GoldParseStateC create_gold_state(Pool mem, StateClass stcls, Example examp return gs +cdef void update_gold_state(GoldParseStateC* gs, StateClass stcls) except *: + for i in range(gs.length): + gs.state_bits[i] = set_state_flag( + gs.state_bits[i], + HEAD_IN_BUFFER, + 0 + ) + gs.state_bits[i] = set_state_flag( + gs.state_bits[i], + HEAD_IN_STACK, + 0 + ) + gs.n_kids_in_stack[i] = 0 + gs.n_kids_in_buffer[i] = 0 + stack_words = set() + for i in range(stcls.stack_depth()): + s_i = stcls.S(i) + head = gs.heads[s_i] + gs.n_kids_in_stack[head] += 1 + stack_words.add(s_i) + buffer_words = set() + for i in range(stcls.buffer_length()): + b_i = stcls.B(i) + head = gs.heads[b_i] + gs.n_kids_in_buffer[head] += 1 + buffer_words.add(b_i) + for i in range(gs.length): + head = gs.heads[i] + if head in stack_words: + gs.state_bits[i] = set_state_flag( + gs.state_bits[i], + HEAD_IN_STACK, + 1 + ) + gs.state_bits[i] = set_state_flag( + gs.state_bits[i], + HEAD_IN_BUFFER, + 0 + ) + elif head in buffer_words: + gs.state_bits[i] = set_state_flag( + gs.state_bits[i], + HEAD_IN_STACK, + 0 + ) + gs.state_bits[i] = set_state_flag( + gs.state_bits[i], + HEAD_IN_BUFFER, + 1 + ) + + cdef class ArcEagerGold: cdef GoldParseStateC c cdef Pool mem @@ -150,6 +202,9 @@ cdef class ArcEagerGold: self.mem = Pool() self.c = create_gold_state(self.mem, stcls, example) + def update(self, StateClass stcls): + update_gold_state(&self.c, stcls) + cdef int check_state_gold(char state_bits, char flag) nogil: @@ -319,22 +374,27 @@ cdef class LeftArc: @staticmethod cdef inline weight_t cost(StateClass s, const void* _gold, attr_t label) nogil: gold = _gold - return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label) + return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label) @staticmethod - cdef inline weight_t move_cost(StateClass s, const void* _gold) nogil: - gold = _gold - if arc_is_gold(gold, s.S(0), s.B(0)): - return 0 - elif s.c.shifted[s.B(0)]: - return push_cost(s, gold, s.B(0)) + cdef inline 
weight_t move_cost(StateClass s, const GoldParseStateC* gold) nogil: + cdef weight_t cost = 0 + s0 = s.S(0) + b0 = s.B(0) + if arc_is_gold(gold, b0, s0): + # Have a negative cost if we 'recover' from the wrong dependency + return 0 if not s.has_head(s0) else -1 else: - return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0)) + # Account for deps we might lose between S0 and stack + if not s.has_head(s0): + cost += gold.n_kids_in_stack[s0] + if is_head_in_buffer(gold, s0): + cost += 1 + return cost + pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0)) @staticmethod - cdef weight_t label_cost(StateClass s, const void* _gold, attr_t label) nogil: - gold = _gold - return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label) + cdef inline weight_t label_cost(StateClass s, const GoldParseStateC* gold, attr_t label) nogil: + return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label) cdef class RightArc: @@ -622,42 +682,17 @@ cdef class ArcEager(TransitionSystem): cdef int set_costs(self, int* is_valid, weight_t* costs, StateClass stcls, gold) except -1: - gold_state = (gold).c - cdef int i, move - cdef attr_t label - cdef label_cost_func_t[N_MOVES] label_cost_funcs - cdef move_cost_func_t[N_MOVES] move_cost_funcs - cdef weight_t[N_MOVES] move_costs - for i in range(N_MOVES): - move_costs[i] = 9000 - move_cost_funcs[SHIFT] = Shift.move_cost - move_cost_funcs[REDUCE] = Reduce.move_cost - move_cost_funcs[LEFT] = LeftArc.move_cost - move_cost_funcs[RIGHT] = RightArc.move_cost - move_cost_funcs[BREAK] = Break.move_cost - - label_cost_funcs[SHIFT] = Shift.label_cost - label_cost_funcs[REDUCE] = Reduce.label_cost - label_cost_funcs[LEFT] = LeftArc.label_cost - label_cost_funcs[RIGHT] = RightArc.label_cost - label_cost_funcs[BREAK] = Break.label_cost - - cdef attr_t* labels = gold_state.labels - cdef int32_t* heads = gold_state.heads - + if not isinstance(gold, ArcEagerGold): + raise TypeError("Expected ArcEagerGold") + cdef ArcEagerGold gold_ = gold + gold_.update(stcls) + gold_state = gold_.c n_gold = 0 for i in range(self.n_moves): if self.c[i].is_valid(stcls.c, self.c[i].label): is_valid[i] = True - move = self.c[i].move - label = self.c[i].label - if move_costs[move] == 9000: - move_costs[move] = move_cost_funcs[move](stcls, &gold_state) - move_cost = move_costs[move] - label_cost = label_cost_funcs[move](stcls, &gold_state, label) - costs[i] = move_cost + label_cost - n_gold += costs[i] <= 0 - print(move, label, costs[i]) + costs[i] = self.c[i].get_cost(stcls, &gold_state, self.c[i].label) + n_gold += 1 else: is_valid[i] = False costs[i] = 9000 From e90341810c3dfac5d912f695fc9e235a6e119120 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 21 Jun 2020 01:04:02 +0200 Subject: [PATCH 42/49] Update arc_eager oracle --- spacy/syntax/arc_eager.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index b8baab49a..13879d898 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -238,7 +238,7 @@ cdef weight_t push_cost(StateClass stcls, const void* _gold, int target) nogil: cdef weight_t cost = 0 if is_head_in_stack(gold, target): cost += 1 - cost += gold.n_kids_in_buffer[target] + cost += gold.n_kids_in_stack[target] if Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0: cost += 1 return cost From 318a046fb094d42e4490c05d8a723696f878c30b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 21 Jun 
2020 01:11:08 +0200 Subject: [PATCH 43/49] Restore ArcEager.get_cost function --- spacy/syntax/arc_eager.pyx | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 13879d898..c7ecbceea 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -562,9 +562,6 @@ cdef class ArcEager(TransitionSystem): def action_types(self): return (SHIFT, REDUCE, LEFT, RIGHT, BREAK) - def get_cost(self, StateClass state, Example gold, action): - raise NotImplementedError - def transition(self, StateClass state, action): cdef Transition t = self.lookup_transition(action) t.do(state.c, t.label) @@ -679,6 +676,18 @@ cdef class ArcEager(TransitionSystem): output[i] = self.c[i].is_valid(st, self.c[i].label) else: output[i] = is_valid[self.c[i].move] + + def get_cost(self, StateClass stcls, gold, int i): + if not isinstance(gold, ArcEagerGold): + raise TypeError("Expected ArcEagerGold") + cdef ArcEagerGold gold_ = gold + gold_state = gold_.c + n_gold = 0 + if self.c[i].is_valid(stcls.c, self.c[i].label): + cost = self.c[i].get_cost(stcls, &gold_state, self.c[i].label) + else: + cost = 9000 + return cost cdef int set_costs(self, int* is_valid, weight_t* costs, StateClass stcls, gold) except -1: From 7544c21f5bff440e60938a0d33c1d73a30b4918e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 21 Jun 2020 01:12:05 +0200 Subject: [PATCH 44/49] Update transition system --- spacy/syntax/transition_system.pyx | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 319550161..46e438e4c 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -1,4 +1,5 @@ # cython: infer_types=True +from __future__ import print_function from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool @@ -67,11 +68,13 @@ cdef class TransitionSystem: costs = mem.alloc(self.n_moves, sizeof(float)) is_valid = mem.alloc(self.n_moves, sizeof(int)) - cdef StateClass state = StateClass(example.predicted, offset=0) - self.initialize_state(state.c) + cdef StateClass state + states, golds, n_steps = self.init_gold_batch([example]) + state = states[0] + gold = golds[0] history = [] while not state.is_final(): - self.set_costs(is_valid, costs, state, example) + self.set_costs(is_valid, costs, state, gold) for i in range(self.n_moves): if is_valid[i] and costs[i] <= 0: action = self.c[i] From 9db66ddd4867c0d5db0967193e7adb249460c31d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 21 Jun 2020 01:12:28 +0200 Subject: [PATCH 45/49] Update test_arc_eager_oracle --- spacy/tests/parser/test_arc_eager_oracle.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index 39f682a34..c2ab94500 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -13,8 +13,9 @@ from spacy.syntax.arc_eager import ArcEager def get_sequence_costs(M, words, heads, deps, transitions): doc = Doc(Vocab(), words=words) example = Example.from_dict(doc, {"heads": heads, "deps": deps}) - state = StateClass(doc) - M.preprocess_gold(example) + states, golds, _ = M.init_gold_batch([example]) + state = states[0] + gold = golds[0] cost_history = [] for gold_action in transitions: state_costs = {} @@ -23,6 +24,7 @@ def get_sequence_costs(M, words, heads, deps, transitions): 
state_costs[name] = M.get_cost(state, gold, i) M.transition(state, gold_action) cost_history.append(state_costs) + gold.update(state) return state, cost_history @@ -59,7 +61,6 @@ def gold(doc, words): raise NotImplementedError -@pytest.mark.xfail def test_oracle_four_words(arc_eager, vocab): words = ["a", "b", "c", "d"] heads = [1, 1, 3, 3] @@ -144,12 +145,11 @@ def test_get_oracle_actions(): parser.moves.add_action(1, "") parser.moves.add_action(1, "") parser.moves.add_action(4, "ROOT") + heads, deps = projectivize(heads, deps) for i, (head, dep) in enumerate(zip(heads, deps)): if head > i: parser.moves.add_action(2, dep) elif head < i: parser.moves.add_action(3, dep) - heads, deps = projectivize(heads, deps) example = Example.from_dict(doc, {"words": words, "tags": tags, "heads": heads, "deps": deps}) - parser.moves.preprocess_gold(example) parser.moves.get_oracle_sequence(example) From 192b94f0a1a605b7f8239d48921cef1b4365efd0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 21 Jun 2020 01:15:12 +0200 Subject: [PATCH 46/49] Remove beam test --- spacy/tests/parser/test_nn_beam.py | 100 ----------------------------- 1 file changed, 100 deletions(-) delete mode 100644 spacy/tests/parser/test_nn_beam.py diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py deleted file mode 100644 index 30e0264f4..000000000 --- a/spacy/tests/parser/test_nn_beam.py +++ /dev/null @@ -1,100 +0,0 @@ -import pytest -import numpy -from spacy.vocab import Vocab -from spacy.language import Language -from spacy.pipeline.defaults import default_parser -from spacy.pipeline import DependencyParser -from spacy.syntax.arc_eager import ArcEager -from spacy.tokens import Doc -from spacy.syntax.stateclass import StateClass - - -@pytest.fixture -def vocab(): - return Vocab() - - -@pytest.fixture -def moves(vocab): - aeager = ArcEager(vocab.strings, {}) - aeager.add_action(2, "nsubj") - aeager.add_action(3, "dobj") - aeager.add_action(2, "aux") - return aeager - - -@pytest.fixture -def docs(vocab): - return [Doc(vocab, words=["Rats", "bite", "things"])] - - -@pytest.fixture -def states(docs): - return [StateClass(doc) for doc in docs] - - -@pytest.fixture -def tokvecs(docs, vector_size): - output = [] - for doc in docs: - vec = numpy.random.uniform(-0.1, 0.1, (len(doc), vector_size)) - output.append(numpy.asarray(vec)) - return output - - -@pytest.fixture -def batch_size(docs): - return len(docs) - - -@pytest.fixture -def beam_width(): - return 4 - - -@pytest.fixture -def vector_size(): - return 6 - - -@pytest.fixture -def beam(moves, states, golds, beam_width): - return ParserBeam(moves, states, golds, width=beam_width, density=0.0) - - -@pytest.fixture -def scores(moves, batch_size, beam_width): - return [ - numpy.asarray( - numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)), dtype="f" - ) - for _ in range(batch_size) - ] - - -# All tests below are skipped after removing Beam stuff during the Example/GoldParse refactor -@pytest.mark.skip -def test_create_beam(beam): - pass - - -@pytest.mark.skip -def test_beam_advance(beam, scores): - beam.advance(scores) - - -@pytest.mark.skip -def test_beam_advance_too_few_scores(beam, scores): - with pytest.raises(IndexError): - beam.advance(scores[:-1]) - - -@pytest.mark.skip -def test_beam_parse(): - nlp = Language() - config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} - nlp.add_pipe(DependencyParser(nlp.vocab, default_parser(), **config), name="parser") - nlp.parser.add_label("nsubj") - 
nlp.parser.begin_training([], token_vector_width=8, hidden_width=8) - doc = nlp.make_doc("Australia is a country") - nlp.parser(doc, beam_width=2) From 2b180ea03343dbc328cb1d81a62b2a719dd512b9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 21 Jun 2020 01:15:41 +0200 Subject: [PATCH 47/49] Update test --- spacy/tests/parser/test_parse.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 80d91e7ae..0d9e257b9 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -22,7 +22,6 @@ TRAIN_DATA = [ ] -@pytest.mark.skip # Segfault def test_parser_root(en_tokenizer): text = "i don't have other assistance" heads = [3, 2, 1, 0, 1, -2] @@ -33,9 +32,8 @@ def test_parser_root(en_tokenizer): assert t.dep != 0, t.text -#@pytest.mark.xfail +@pytest.mark.xfail #@pytest.mark.parametrize("text", ["Hello"]) -@pytest.mark.skip # Segfault def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text): tokens = en_tokenizer(text) doc = get_doc( @@ -48,7 +46,6 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text): assert doc[0].dep != 0 -@pytest.mark.skip # Segfault def test_parser_initial(en_tokenizer, en_parser): text = "I ate the pizza with anchovies." # heads = [1, 0, 1, -2, -3, -1, -5] @@ -61,7 +58,6 @@ def test_parser_initial(en_tokenizer, en_parser): assert tokens[3].head.i == 3 -@pytest.mark.skip # Segfault def test_parser_parse_subtrees(en_tokenizer, en_parser): text = "The four wheels on the bus turned quickly" heads = [2, 1, 4, -1, 1, -2, 0, -1] @@ -76,7 +72,6 @@ def test_parser_parse_subtrees(en_tokenizer, en_parser): assert len(list(doc[2].subtree)) == 6 -@pytest.mark.skip # Segfault def test_parser_merge_pp(en_tokenizer): text = "A phrase with another phrase occurs" heads = [1, 4, -1, 1, -2, 0] @@ -95,7 +90,6 @@ def test_parser_merge_pp(en_tokenizer): assert doc[3].text == "occurs" -@pytest.mark.skip # Segfault def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser): text = "a b c d e" @@ -170,7 +164,6 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser): assert tokens[4].head.i == 4 -@pytest.mark.skip # Segfault def test_parser_set_sent_starts(en_vocab): # fmt: off words = ['Ein', 'Satz', '.', 'Außerdem', 'ist', 'Zimmer', 'davon', 'überzeugt', ',', 'dass', 'auch', 'epige-', '\n', 'netische', 'Mechanismen', 'eine', 'Rolle', 'spielen', ',', 'also', 'Vorgänge', ',', 'die', '\n', 'sich', 'darauf', 'auswirken', ',', 'welche', 'Gene', 'abgelesen', 'werden', 'und', '\n', 'welche', 'nicht', '.', '\n'] @@ -187,7 +180,6 @@ def test_parser_set_sent_starts(en_vocab): for token in sent: assert token.head in sent -@pytest.mark.skip def test_overfitting_IO(): # Simple test to try and quickly overfit the dependency parser - ensuring the ML models work correctly nlp = English() From 90d9f04e0b268dc9dd288e129d65928432b9ddf8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 21 Jun 2020 01:16:33 +0200 Subject: [PATCH 48/49] Unskip --- spacy/tests/parser/test_add_label.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index 093d4e266..4afa11963 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -44,8 +44,6 @@ def _train_parser(parser): return parser -# Segfaulting due to refactor. Need to fix. 
-@pytest.mark.skip def test_add_label(parser): parser = _train_parser(parser) parser.add_label("right") @@ -64,8 +62,6 @@ def test_add_label(parser): assert doc[2].dep_ == "left" -# segfaulting due to refactor. need to fix. -@pytest.mark.skip def test_add_label_deserializes_correctly(): config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} ner1 = EntityRecognizer(Vocab(), default_ner(), **config) @@ -82,8 +78,6 @@ def test_add_label_deserializes_correctly(): for i in range(ner1.moves.n_moves): assert ner1.moves.get_class_name(i) == ner2.moves.get_class_name(i) -# segfaulting due to refactor. need to fix. -@pytest.mark.skip @pytest.mark.parametrize( "pipe_cls,n_moves,model", [(DependencyParser, 5, default_parser()), (EntityRecognizer, 4, default_ner())], From 6670c443904e2a29da0cb0096804eb4507d5f2d7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 21 Jun 2020 01:17:52 +0200 Subject: [PATCH 49/49] Unskip tests --- spacy/tests/parser/test_preset_sbd.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index 9a2e1cfe8..5a29d84f4 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -33,14 +33,12 @@ def parser(vocab): return parser -@pytest.mark.skip # Segfaults def test_no_sentences(parser): doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc = parser(doc) assert len(list(doc.sents)) >= 1 -@pytest.mark.skip # Segfaults def test_sents_1(parser): doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc[2].sent_start = True @@ -54,7 +52,6 @@ def test_sents_1(parser): assert len(list(doc.sents)) == 2 -@pytest.mark.skip # Segfaults def test_sents_1_2(parser): doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc[1].sent_start = True @@ -63,7 +60,6 @@ def test_sents_1_2(parser): assert len(list(doc.sents)) >= 3 -@pytest.mark.skip # Segfaults def test_sents_1_3(parser): doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc[1].sent_start = True