From 512ba48217df45fd236ebdc83a4f376e4b115345 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 1 Dec 2018 17:37:27 +0100
Subject: [PATCH 01/27] Revert "Allow binary deps when building pex"

This reverts commit 2d0c366101d9989c91e6587edc8afa1eb83c8106.
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 8180ee9bc..34566d80c 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ dist/spacy.pex : spacy/*.py* spacy/*/*.py*
 	python3.6 -m venv env3.6
 	source env3.6/bin/activate
 	env3.6/bin/pip install wheel
-	env3.6/bin/pip install -r requirements.txt
+	env3.6/bin/pip install -r requirements.txt --no-cache-dir --no-binary :all:
 	env3.6/bin/python setup.py build_ext --inplace
 	env3.6/bin/python setup.py sdist
 	env3.6/bin/python setup.py bdist_wheel

From b47bd6a27f8a6c2d9e78cb45a1676f0929ebbbff Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 2 Dec 2018 03:57:19 +0100
Subject: [PATCH 02/27] Update thinc version

---
 requirements.txt | 2 +-
 setup.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index cbfae4c0d..3d495277e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 numpy>=1.15.0
 cymem>=2.0.2,<2.1.0
 preshed>=2.0.1,<2.1.0
-thinc==7.0.0.dev3
+thinc==7.0.0.dev4
 blis>=0.2.2,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 cytoolz>=0.9.0,<0.10.0
diff --git a/setup.py b/setup.py
index 363a1914e..05d074f28 100755
--- a/setup.py
+++ b/setup.py
@@ -200,7 +200,7 @@ def setup_package():
 "murmurhash>=0.28.0,<1.1.0",
 "cymem>=2.0.2,<2.1.0",
 "preshed>=2.0.1,<2.1.0",
-"thinc==7.0.0.dev3",
+"thinc==7.0.0.dev4",
 "blis>=0.2.2,<0.3.0",
 "plac<1.0.0,>=0.9.6",
 "ujson>=1.35",

From db7d2509246d8416b7d39a7b931306bfe4ff9364 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sun, 2 Dec 2018 04:22:23 +0100
Subject: [PATCH 03/27] Update README.md

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 9e6fb4bc8..0c5d0ba59 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
+
+
 # spaCy: Industrial-strength NLP

 spaCy is a library for advanced Natural Language Processing in Python and

From 45798cc53e06b585e2f714f4f4b27a2949fb7717 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sun, 2 Dec 2018 04:26:26 +0100
Subject: [PATCH 04/27] Auto-format examples

---
 examples/deep_learning_keras.py | 134 ++++++----
 .../entity_relations.py | 21 +-
 .../information_extraction/parse_subtrees.py | 22 +-
 examples/keras_parikh_entailment/__main__.py | 26 +-
 .../keras_decomposable_attention.py | 20 +-
 examples/pipeline/custom_attr_methods.py | 27 +-
 .../custom_component_countries_api.py | 54 ++--
 .../pipeline/custom_component_entities.py | 36 +--
 .../pipeline/custom_sentence_segmentation.py | 17 +-
 examples/pipeline/fix_space_entities.py | 25 +-
 examples/pipeline/multi_processing.py | 35 +--
 examples/training/conllu.py | 234 +++++++++++-------
 examples/training/ner_multitask_objective.py | 40 +--
 examples/training/pretrain_textcat.py | 84 ++++---
 examples/training/train_intent_parser.py | 126 ++++++----
 examples/training/train_ner.py | 40 ++-
 examples/training/train_new_entity_type.py | 70 +++---
 examples/training/train_parser.py | 47 ++--
 examples/training/train_tagger.py | 27 +-
 examples/training/train_textcat.py | 53 ++--
 examples/vectors_fast_text.py | 21 +-
 examples/vectors_tensorboard.py | 55 ++--
 22 files changed, 708 insertions(+), 506 deletions(-)

diff --git a/examples/deep_learning_keras.py b/examples/deep_learning_keras.py index 4d328f96d..131d055e0 100644 ---
a/examples/deep_learning_keras.py +++ b/examples/deep_learning_keras.py @@ -1,5 +1,12 @@ """ -This example shows how to use an LSTM sentiment classification model trained using Keras in spaCy. spaCy splits the document into sentences, and each sentence is classified using the LSTM. The scores for the sentences are then aggregated to give the document score. This kind of hierarchical model is quite difficult in "pure" Keras or Tensorflow, but it's very effective. The Keras example on this dataset performs quite poorly, because it cuts off the documents so that they're a fixed size. This hurts review accuracy a lot, because people often summarise their rating in the final sentence +This example shows how to use an LSTM sentiment classification model trained +using Keras in spaCy. spaCy splits the document into sentences, and each +sentence is classified using the LSTM. The scores for the sentences are then +aggregated to give the document score. This kind of hierarchical model is quite +difficult in "pure" Keras or Tensorflow, but it's very effective. The Keras +example on this dataset performs quite poorly, because it cuts off the documents +so that they're a fixed size. This hurts review accuracy a lot, because people +often summarise their rating in the final sentence Prerequisites: spacy download en_vectors_web_lg @@ -25,9 +32,9 @@ import spacy class SentimentAnalyser(object): @classmethod def load(cls, path, nlp, max_length=100): - with (path / 'config.json').open() as file_: + with (path / "config.json").open() as file_: model = model_from_json(file_.read()) - with (path / 'model').open('rb') as file_: + with (path / "model").open("rb") as file_: lstm_weights = pickle.load(file_) embeddings = get_embeddings(nlp.vocab) model.set_weights([embeddings] + lstm_weights) @@ -69,12 +76,12 @@ def get_labelled_sentences(docs, doc_labels): for sent in doc.sents: sentences.append(sent) labels.append(y) - return sentences, numpy.asarray(labels, dtype='int32') + return sentences, numpy.asarray(labels, dtype="int32") def get_features(docs, max_length): docs = list(docs) - Xs = numpy.zeros((len(docs), max_length), dtype='int32') + Xs = numpy.zeros((len(docs), max_length), dtype="int32") for i, doc in enumerate(docs): j = 0 for token in doc: @@ -89,16 +96,25 @@ def get_features(docs, max_length): return Xs -def train(train_texts, train_labels, dev_texts, dev_labels, - lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, - nb_epoch=5, by_sentence=True): - +def train( + train_texts, + train_labels, + dev_texts, + dev_labels, + lstm_shape, + lstm_settings, + lstm_optimizer, + batch_size=100, + nb_epoch=5, + by_sentence=True, +): + print("Loading spaCy") - nlp = spacy.load('en_vectors_web_lg') - nlp.add_pipe(nlp.create_pipe('sentencizer')) + nlp = spacy.load("en_vectors_web_lg") + nlp.add_pipe(nlp.create_pipe("sentencizer")) embeddings = get_embeddings(nlp.vocab) model = compile_lstm(embeddings, lstm_shape, lstm_settings) - + print("Parsing texts...") train_docs = list(nlp.pipe(train_texts)) dev_docs = list(nlp.pipe(dev_texts)) @@ -106,10 +122,15 @@ def train(train_texts, train_labels, dev_texts, dev_labels, train_docs, train_labels = get_labelled_sentences(train_docs, train_labels) dev_docs, dev_labels = get_labelled_sentences(dev_docs, dev_labels) - train_X = get_features(train_docs, lstm_shape['max_length']) - dev_X = get_features(dev_docs, lstm_shape['max_length']) - model.fit(train_X, train_labels, validation_data=(dev_X, dev_labels), - epochs=nb_epoch, batch_size=batch_size) + train_X = 
get_features(train_docs, lstm_shape["max_length"]) + dev_X = get_features(dev_docs, lstm_shape["max_length"]) + model.fit( + train_X, + train_labels, + validation_data=(dev_X, dev_labels), + epochs=nb_epoch, + batch_size=batch_size, + ) return model @@ -119,19 +140,28 @@ def compile_lstm(embeddings, shape, settings): Embedding( embeddings.shape[0], embeddings.shape[1], - input_length=shape['max_length'], + input_length=shape["max_length"], trainable=False, weights=[embeddings], - mask_zero=True + mask_zero=True, ) ) - model.add(TimeDistributed(Dense(shape['nr_hidden'], use_bias=False))) - model.add(Bidirectional(LSTM(shape['nr_hidden'], - recurrent_dropout=settings['dropout'], - dropout=settings['dropout']))) - model.add(Dense(shape['nr_class'], activation='sigmoid')) - model.compile(optimizer=Adam(lr=settings['lr']), loss='binary_crossentropy', - metrics=['accuracy']) + model.add(TimeDistributed(Dense(shape["nr_hidden"], use_bias=False))) + model.add( + Bidirectional( + LSTM( + shape["nr_hidden"], + recurrent_dropout=settings["dropout"], + dropout=settings["dropout"], + ) + ) + ) + model.add(Dense(shape["nr_class"], activation="sigmoid")) + model.compile( + optimizer=Adam(lr=settings["lr"]), + loss="binary_crossentropy", + metrics=["accuracy"], + ) return model @@ -140,8 +170,8 @@ def get_embeddings(vocab): def evaluate(model_dir, texts, labels, max_length=100): - nlp = spacy.load('en_vectors_web_lg') - nlp.add_pipe(nlp.create_pipe('sentencizer')) + nlp = spacy.load("en_vectors_web_lg") + nlp.add_pipe(nlp.create_pipe("sentencizer")) nlp.add_pipe(SentimentAnalyser.load(model_dir, nlp, max_length=max_length)) correct = 0 @@ -154,7 +184,7 @@ def evaluate(model_dir, texts, labels, max_length=100): def read_data(data_dir, limit=0): examples = [] - for subdir, label in (('pos', 1), ('neg', 0)): + for subdir, label in (("pos", 1), ("neg", 0)): for filename in (data_dir / subdir).iterdir(): with filename.open() as file_: text = file_.read() @@ -162,7 +192,7 @@ def read_data(data_dir, limit=0): random.shuffle(examples) if limit >= 1: examples = examples[:limit] - return zip(*examples) # Unzips into two lists + return zip(*examples) # Unzips into two lists @plac.annotations( @@ -176,13 +206,21 @@ def read_data(data_dir, limit=0): learn_rate=("Learn rate", "option", "e", float), nb_epoch=("Number of training epochs", "option", "i", int), batch_size=("Size of minibatches for training LSTM", "option", "b", int), - nr_examples=("Limit to N examples", "option", "n", int) + nr_examples=("Limit to N examples", "option", "n", int), ) -def main(model_dir=None, train_dir=None, dev_dir=None, - is_runtime=False, - nr_hidden=64, max_length=100, # Shape - dropout=0.5, learn_rate=0.001, # General NN config - nb_epoch=5, batch_size=256, nr_examples=-1): # Training params +def main( + model_dir=None, + train_dir=None, + dev_dir=None, + is_runtime=False, + nr_hidden=64, + max_length=100, # Shape + dropout=0.5, + learn_rate=0.001, # General NN config + nb_epoch=5, + batch_size=256, + nr_examples=-1, +): # Training params if model_dir is not None: model_dir = pathlib.Path(model_dir) if train_dir is None or dev_dir is None: @@ -204,20 +242,26 @@ def main(model_dir=None, train_dir=None, dev_dir=None, dev_texts, dev_labels = zip(*imdb_data[1]) else: dev_texts, dev_labels = read_data(dev_dir, imdb_data, limit=nr_examples) - train_labels = numpy.asarray(train_labels, dtype='int32') - dev_labels = numpy.asarray(dev_labels, dtype='int32') - lstm = train(train_texts, train_labels, dev_texts, dev_labels, - {'nr_hidden': 
nr_hidden, 'max_length': max_length, 'nr_class': 1}, - {'dropout': dropout, 'lr': learn_rate}, - {}, - nb_epoch=nb_epoch, batch_size=batch_size) + train_labels = numpy.asarray(train_labels, dtype="int32") + dev_labels = numpy.asarray(dev_labels, dtype="int32") + lstm = train( + train_texts, + train_labels, + dev_texts, + dev_labels, + {"nr_hidden": nr_hidden, "max_length": max_length, "nr_class": 1}, + {"dropout": dropout, "lr": learn_rate}, + {}, + nb_epoch=nb_epoch, + batch_size=batch_size, + ) weights = lstm.get_weights() if model_dir is not None: - with (model_dir / 'model').open('wb') as file_: + with (model_dir / "model").open("wb") as file_: pickle.dump(weights[1:], file_) - with (model_dir / 'config.json').open('w') as file_: + with (model_dir / "config.json").open("w") as file_: file_.write(lstm.to_json()) -if __name__ == '__main__': +if __name__ == "__main__": plac.call(main) diff --git a/examples/information_extraction/entity_relations.py b/examples/information_extraction/entity_relations.py index 3c3b8132f..aab5d4f33 100644 --- a/examples/information_extraction/entity_relations.py +++ b/examples/information_extraction/entity_relations.py @@ -15,14 +15,15 @@ import spacy TEXTS = [ - 'Net income was $9.4 million compared to the prior year of $2.7 million.', - 'Revenue exceeded twelve billion dollars, with a loss of $1b.', + "Net income was $9.4 million compared to the prior year of $2.7 million.", + "Revenue exceeded twelve billion dollars, with a loss of $1b.", ] @plac.annotations( - model=("Model to load (needs parser and NER)", "positional", None, str)) -def main(model='en_core_web_sm'): + model=("Model to load (needs parser and NER)", "positional", None, str) +) +def main(model="en_core_web_sm"): nlp = spacy.load(model) print("Loaded model '%s'" % model) print("Processing %d texts" % len(TEXTS)) @@ -31,7 +32,7 @@ def main(model='en_core_web_sm'): doc = nlp(text) relations = extract_currency_relations(doc) for r1, r2 in relations: - print('{:<10}\t{}\t{}'.format(r1.text, r2.ent_type_, r2.text)) + print("{:<10}\t{}\t{}".format(r1.text, r2.ent_type_, r2.text)) def extract_currency_relations(doc): @@ -41,18 +42,18 @@ def extract_currency_relations(doc): span.merge() relations = [] - for money in filter(lambda w: w.ent_type_ == 'MONEY', doc): - if money.dep_ in ('attr', 'dobj'): - subject = [w for w in money.head.lefts if w.dep_ == 'nsubj'] + for money in filter(lambda w: w.ent_type_ == "MONEY", doc): + if money.dep_ in ("attr", "dobj"): + subject = [w for w in money.head.lefts if w.dep_ == "nsubj"] if subject: subject = subject[0] relations.append((subject, money)) - elif money.dep_ == 'pobj' and money.head.dep_ == 'prep': + elif money.dep_ == "pobj" and money.head.dep_ == "prep": relations.append((money.head.head, money)) return relations -if __name__ == '__main__': +if __name__ == "__main__": plac.call(main) # Expected output: diff --git a/examples/information_extraction/parse_subtrees.py b/examples/information_extraction/parse_subtrees.py index ade4627de..55968c9da 100644 --- a/examples/information_extraction/parse_subtrees.py +++ b/examples/information_extraction/parse_subtrees.py @@ -24,37 +24,39 @@ import plac import spacy -@plac.annotations( - model=("Model to load", "positional", None, str)) -def main(model='en_core_web_sm'): +@plac.annotations(model=("Model to load", "positional", None, str)) +def main(model="en_core_web_sm"): nlp = spacy.load(model) print("Loaded model '%s'" % model) - doc = nlp("displaCy uses CSS and JavaScript to show you how computers " - "understand 
language") + doc = nlp( + "displaCy uses CSS and JavaScript to show you how computers " + "understand language" + ) # The easiest way is to find the head of the subtree you want, and then use # the `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree` # is the one that does what you're asking for most directly: for word in doc: - if word.dep_ in ('xcomp', 'ccomp'): - print(''.join(w.text_with_ws for w in word.subtree)) + if word.dep_ in ("xcomp", "ccomp"): + print("".join(w.text_with_ws for w in word.subtree)) # It'd probably be better for `word.subtree` to return a `Span` object # instead of a generator over the tokens. If you want the `Span` you can # get it via the `.right_edge` and `.left_edge` properties. The `Span` # object is nice because you can easily get a vector, merge it, etc. for word in doc: - if word.dep_ in ('xcomp', 'ccomp'): + if word.dep_ in ("xcomp", "ccomp"): subtree_span = doc[word.left_edge.i : word.right_edge.i + 1] - print(subtree_span.text, '|', subtree_span.root.text) + print(subtree_span.text, "|", subtree_span.root.text) # You might also want to select a head, and then select a start and end # position by walking along its children. You could then take the # `.left_edge` and `.right_edge` of those tokens, and use it to calculate # a span. -if __name__ == '__main__': + +if __name__ == "__main__": plac.call(main) # Expected output: diff --git a/examples/keras_parikh_entailment/__main__.py b/examples/keras_parikh_entailment/__main__.py index 0b0d93208..7cd66a20c 100644 --- a/examples/keras_parikh_entailment/__main__.py +++ b/examples/keras_parikh_entailment/__main__.py @@ -32,7 +32,7 @@ def set_keras_backend(backend): K.set_session(K.tf.Session(config=cfg)) K.clear_session() -set_keras_backend("tensorflow") +set_keras_backend("tensorflow") def train(train_loc, dev_loc, shape, settings): @@ -42,7 +42,7 @@ def train(train_loc, dev_loc, shape, settings): print("Loading spaCy") nlp = spacy.load('en_vectors_web_lg') assert nlp.path is not None - + print("Processing texts...") train_X = create_dataset(nlp, train_texts1, train_texts2, 100, shape[0]) dev_X = create_dataset(nlp, dev_texts1, dev_texts2, 100, shape[0]) @@ -57,7 +57,7 @@ def train(train_loc, dev_loc, shape, settings): validation_data = (dev_X, dev_labels), epochs = settings['nr_epoch'], batch_size = settings['batch_size']) - + if not (nlp.path / 'similarity').exists(): (nlp.path / 'similarity').mkdir() print("Saving to", nlp.path / 'similarity') @@ -74,7 +74,7 @@ def evaluate(dev_loc, shape): dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc) nlp = spacy.load('en_vectors_web_lg') nlp.add_pipe(KerasSimilarityShim.load(nlp.path / 'similarity', nlp, shape[0])) - + total = 0. correct = 0. for text1, text2, label in zip(dev_texts1, dev_texts2, dev_labels): @@ -119,33 +119,33 @@ def read_snli(path): def create_dataset(nlp, texts, hypotheses, num_unk, max_length): sents = texts + hypotheses - + sents_as_ids = [] for sent in sents: doc = nlp(sent) word_ids = [] - + for i, token in enumerate(doc): # skip odd spaces from tokenizer if token.has_vector and token.vector_norm == 0: continue - + if i > max_length: break - + if token.has_vector: word_ids.append(token.rank + num_unk + 1) else: # if we don't have a vector, pick an OOV entry - word_ids.append(token.rank % num_unk + 1) - + word_ids.append(token.rank % num_unk + 1) + # there must be a simpler way of generating padded arrays from lists... 
word_id_vec = np.zeros((max_length), dtype='int') clipped_len = min(max_length, len(word_ids)) word_id_vec[:clipped_len] = word_ids[:clipped_len] sents_as_ids.append(word_id_vec) - - + + return [np.array(sents_as_ids[:len(texts)]), np.array(sents_as_ids[len(texts):])] @@ -169,7 +169,7 @@ def main(mode, train_loc, dev_loc, batch_size = 1024, nr_epoch = 10, entail_dir="both"): - + shape = (max_length, nr_hidden, 3) settings = { 'lr': learn_rate, diff --git a/examples/keras_parikh_entailment/keras_decomposable_attention.py b/examples/keras_parikh_entailment/keras_decomposable_attention.py index 0cb7bc2b9..c5a9f4660 100644 --- a/examples/keras_parikh_entailment/keras_decomposable_attention.py +++ b/examples/keras_parikh_entailment/keras_decomposable_attention.py @@ -10,19 +10,19 @@ def build_model(vectors, shape, settings): input1 = layers.Input(shape=(max_length,), dtype='int32', name='words1') input2 = layers.Input(shape=(max_length,), dtype='int32', name='words2') - + # embeddings (projected) embed = create_embedding(vectors, max_length, nr_hidden) - + a = embed(input1) b = embed(input2) - + # step 1: attend F = create_feedforward(nr_hidden) att_weights = layers.dot([F(a), F(b)], axes=-1) - + G = create_feedforward(nr_hidden) - + if settings['entail_dir'] == 'both': norm_weights_a = layers.Lambda(normalizer(1))(att_weights) norm_weights_b = layers.Lambda(normalizer(2))(att_weights) @@ -55,18 +55,18 @@ def build_model(vectors, shape, settings): v1 = layers.TimeDistributed(G)(comp1) v1_sum = layers.Lambda(sum_word)(v1) concat = v1_sum - + H = create_feedforward(nr_hidden) out = H(concat) out = layers.Dense(nr_class, activation='softmax')(out) - + model = Model([input1, input2], out) - + model.compile( optimizer=optimizers.Adam(lr=settings['lr']), loss='categorical_crossentropy', metrics=['accuracy']) - + return model @@ -78,7 +78,7 @@ def create_embedding(vectors, max_length, projected_dim): input_length=max_length, weights=[vectors], trainable=False), - + layers.TimeDistributed( layers.Dense(projected_dim, activation=None, diff --git a/examples/pipeline/custom_attr_methods.py b/examples/pipeline/custom_attr_methods.py index c843d7514..59ce1d0e5 100644 --- a/examples/pipeline/custom_attr_methods.py +++ b/examples/pipeline/custom_attr_methods.py @@ -19,39 +19,40 @@ from pathlib import Path @plac.annotations( - output_dir=("Output directory for saved HTML", "positional", None, Path)) + output_dir=("Output directory for saved HTML", "positional", None, Path) +) def main(output_dir=None): nlp = English() # start off with blank English class - Doc.set_extension('overlap', method=overlap_tokens) - doc1 = nlp(u"Peach emoji is where it has always been.") - doc2 = nlp(u"Peach is the superior emoji.") + Doc.set_extension("overlap", method=overlap_tokens) + doc1 = nlp("Peach emoji is where it has always been.") + doc2 = nlp("Peach is the superior emoji.") print("Text 1:", doc1.text) print("Text 2:", doc2.text) print("Overlapping tokens:", doc1._.overlap(doc2)) - Doc.set_extension('to_html', method=to_html) - doc = nlp(u"This is a sentence about Apple.") + Doc.set_extension("to_html", method=to_html) + doc = nlp("This is a sentence about Apple.") # add entity manually for demo purposes, to make it work without a model - doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])] + doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings["ORG"])] print("Text:", doc.text) - doc._.to_html(output=output_dir, style='ent') + doc._.to_html(output=output_dir, style="ent") -def to_html(doc, output='/tmp', 
style='dep'): +def to_html(doc, output="/tmp", style="dep"): """Doc method extension for saving the current state as a displaCy visualization. """ # generate filename from first six non-punct tokens - file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html' + file_name = "-".join([w.text for w in doc[:6] if not w.is_punct]) + ".html" html = displacy.render(doc, style=style, page=True) # render markup if output is not None: output_path = Path(output) if not output_path.exists(): output_path.mkdir() output_file = Path(output) / file_name - output_file.open('w', encoding='utf-8').write(html) # save to file - print('Saved HTML to {}'.format(output_file)) + output_file.open("w", encoding="utf-8").write(html) # save to file + print("Saved HTML to {}".format(output_file)) else: print(html) @@ -67,7 +68,7 @@ def overlap_tokens(doc, other_doc): return overlap -if __name__ == '__main__': +if __name__ == "__main__": plac.call(main) # Expected output: diff --git a/examples/pipeline/custom_component_countries_api.py b/examples/pipeline/custom_component_countries_api.py index 50abfe530..091d331fc 100644 --- a/examples/pipeline/custom_component_countries_api.py +++ b/examples/pipeline/custom_component_countries_api.py @@ -25,15 +25,19 @@ def main(): # and no model or pre-defined pipeline loaded. nlp = English() rest_countries = RESTCountriesComponent(nlp) # initialise component - nlp.add_pipe(rest_countries) # add it to the pipeline - doc = nlp(u"Some text about Colombia and the Czech Republic") - print('Pipeline', nlp.pipe_names) # pipeline contains component name - print('Doc has countries', doc._.has_country) # Doc contains countries + nlp.add_pipe(rest_countries) # add it to the pipeline + doc = nlp("Some text about Colombia and the Czech Republic") + print("Pipeline", nlp.pipe_names) # pipeline contains component name + print("Doc has countries", doc._.has_country) # Doc contains countries for token in doc: if token._.is_country: - print(token.text, token._.country_capital, token._.country_latlng, - token._.country_flag) # country data - print('Entities', [(e.text, e.label_) for e in doc.ents]) # entities + print( + token.text, + token._.country_capital, + token._.country_latlng, + token._.country_flag, + ) # country data + print("Entities", [(e.text, e.label_) for e in doc.ents]) # entities class RESTCountriesComponent(object): @@ -41,42 +45,42 @@ class RESTCountriesComponent(object): the REST Countries API, merges country names into one token, assigns entity labels and sets attributes on country tokens. """ - name = 'rest_countries' # component name, will show up in the pipeline - def __init__(self, nlp, label='GPE'): + name = "rest_countries" # component name, will show up in the pipeline + + def __init__(self, nlp, label="GPE"): """Initialise the pipeline component. The shared nlp instance is used to initialise the matcher with the shared vocab, get the label ID and generate Doc objects as phrase match patterns. 
""" # Make request once on initialisation and store the data - r = requests.get('https://restcountries.eu/rest/v2/all') + r = requests.get("https://restcountries.eu/rest/v2/all") r.raise_for_status() # make sure requests raises an error if it fails countries = r.json() # Convert API response to dict keyed by country name for easy lookup # This could also be extended using the alternative and foreign language # names provided by the API - self.countries = {c['name']: c for c in countries} + self.countries = {c["name"]: c for c in countries} self.label = nlp.vocab.strings[label] # get entity label ID # Set up the PhraseMatcher with Doc patterns for each country name patterns = [nlp(c) for c in self.countries.keys()] self.matcher = PhraseMatcher(nlp.vocab) - self.matcher.add('COUNTRIES', None, *patterns) + self.matcher.add("COUNTRIES", None, *patterns) # Register attribute on the Token. We'll be overwriting this based on # the matches, so we're only setting a default value, not a getter. # If no default value is set, it defaults to None. - Token.set_extension('is_country', default=False) - Token.set_extension('country_capital', default=False) - Token.set_extension('country_latlng', default=False) - Token.set_extension('country_flag', default=False) + Token.set_extension("is_country", default=False) + Token.set_extension("country_capital", default=False) + Token.set_extension("country_latlng", default=False) + Token.set_extension("country_flag", default=False) # Register attributes on Doc and Span via a getter that checks if one of # the contained tokens is set to is_country == True. - Doc.set_extension('has_country', getter=self.has_country) - Span.set_extension('has_country', getter=self.has_country) - + Doc.set_extension("has_country", getter=self.has_country) + Span.set_extension("has_country", getter=self.has_country) def __call__(self, doc): """Apply the pipeline component on a Doc object and modify it if matches @@ -93,10 +97,10 @@ class RESTCountriesComponent(object): # Can be extended with other data returned by the API, like # currencies, country code, flag, calling code etc. for token in entity: - token._.set('is_country', True) - token._.set('country_capital', self.countries[entity.text]['capital']) - token._.set('country_latlng', self.countries[entity.text]['latlng']) - token._.set('country_flag', self.countries[entity.text]['flag']) + token._.set("is_country", True) + token._.set("country_capital", self.countries[entity.text]["capital"]) + token._.set("country_latlng", self.countries[entity.text]["latlng"]) + token._.set("country_flag", self.countries[entity.text]["flag"]) # Overwrite doc.ents and add entity – be careful not to replace! doc.ents = list(doc.ents) + [entity] for span in spans: @@ -111,10 +115,10 @@ class RESTCountriesComponent(object): is a country. 
Since the getter is only called when we access the attribute, we can refer to the Token's 'is_country' attribute here, which is already set in the processing step.""" - return any([t._.get('is_country') for t in tokens]) + return any([t._.get("is_country") for t in tokens]) -if __name__ == '__main__': +if __name__ == "__main__": plac.call(main) # Expected output: diff --git a/examples/pipeline/custom_component_entities.py b/examples/pipeline/custom_component_entities.py index f4de4bf6f..c7f48d504 100644 --- a/examples/pipeline/custom_component_entities.py +++ b/examples/pipeline/custom_component_entities.py @@ -20,23 +20,24 @@ from spacy.tokens import Doc, Span, Token @plac.annotations( text=("Text to process", "positional", None, str), - companies=("Names of technology companies", "positional", None, str)) + companies=("Names of technology companies", "positional", None, str), +) def main(text="Alphabet Inc. is the company behind Google.", *companies): # For simplicity, we start off with only the blank English Language class # and no model or pre-defined pipeline loaded. nlp = English() if not companies: # set default companies if none are set via args - companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple'] # etc. + companies = ["Alphabet Inc.", "Google", "Netflix", "Apple"] # etc. component = TechCompanyRecognizer(nlp, companies) # initialise component nlp.add_pipe(component, last=True) # add last to the pipeline doc = nlp(text) - print('Pipeline', nlp.pipe_names) # pipeline contains component name - print('Tokens', [t.text for t in doc]) # company names from the list are merged - print('Doc has_tech_org', doc._.has_tech_org) # Doc contains tech orgs - print('Token 0 is_tech_org', doc[0]._.is_tech_org) # "Alphabet Inc." is a tech org - print('Token 1 is_tech_org', doc[1]._.is_tech_org) # "is" is not - print('Entities', [(e.text, e.label_) for e in doc.ents]) # all orgs are entities + print("Pipeline", nlp.pipe_names) # pipeline contains component name + print("Tokens", [t.text for t in doc]) # company names from the list are merged + print("Doc has_tech_org", doc._.has_tech_org) # Doc contains tech orgs + print("Token 0 is_tech_org", doc[0]._.is_tech_org) # "Alphabet Inc." is a tech org + print("Token 1 is_tech_org", doc[1]._.is_tech_org) # "is" is not + print("Entities", [(e.text, e.label_) for e in doc.ents]) # all orgs are entities class TechCompanyRecognizer(object): @@ -45,9 +46,10 @@ class TechCompanyRecognizer(object): labelled as ORG and their spans are merged into one token. Additionally, ._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token respectively.""" - name = 'tech_companies' # component name, will show up in the pipeline - def __init__(self, nlp, companies=tuple(), label='ORG'): + name = "tech_companies" # component name, will show up in the pipeline + + def __init__(self, nlp, companies=tuple(), label="ORG"): """Initialise the pipeline component. The shared nlp instance is used to initialise the matcher with the shared vocab, get the label ID and generate Doc objects as phrase match patterns. @@ -58,16 +60,16 @@ class TechCompanyRecognizer(object): # so even if the list of companies is long, it's very efficient patterns = [nlp(org) for org in companies] self.matcher = PhraseMatcher(nlp.vocab) - self.matcher.add('TECH_ORGS', None, *patterns) + self.matcher.add("TECH_ORGS", None, *patterns) # Register attribute on the Token. We'll be overwriting this based on # the matches, so we're only setting a default value, not a getter. 
- Token.set_extension('is_tech_org', default=False) + Token.set_extension("is_tech_org", default=False) # Register attributes on Doc and Span via a getter that checks if one of # the contained tokens is set to is_tech_org == True. - Doc.set_extension('has_tech_org', getter=self.has_tech_org) - Span.set_extension('has_tech_org', getter=self.has_tech_org) + Doc.set_extension("has_tech_org", getter=self.has_tech_org) + Span.set_extension("has_tech_org", getter=self.has_tech_org) def __call__(self, doc): """Apply the pipeline component on a Doc object and modify it if matches @@ -82,7 +84,7 @@ class TechCompanyRecognizer(object): spans.append(entity) # Set custom attribute on each token of the entity for token in entity: - token._.set('is_tech_org', True) + token._.set("is_tech_org", True) # Overwrite doc.ents and add entity – be careful not to replace! doc.ents = list(doc.ents) + [entity] for span in spans: @@ -97,10 +99,10 @@ class TechCompanyRecognizer(object): is a tech org. Since the getter is only called when we access the attribute, we can refer to the Token's 'is_tech_org' attribute here, which is already set in the processing step.""" - return any([t._.get('is_tech_org') for t in tokens]) + return any([t._.get("is_tech_org") for t in tokens]) -if __name__ == '__main__': +if __name__ == "__main__": plac.call(main) # Expected output: diff --git a/examples/pipeline/custom_sentence_segmentation.py b/examples/pipeline/custom_sentence_segmentation.py index d4f8aee63..43ac64979 100644 --- a/examples/pipeline/custom_sentence_segmentation.py +++ b/examples/pipeline/custom_sentence_segmentation.py @@ -1,4 +1,4 @@ -'''Example of adding a pipeline component to prohibit sentence boundaries +"""Example of adding a pipeline component to prohibit sentence boundaries before certain tokens. What we do is write to the token.is_sent_start attribute, which @@ -10,16 +10,18 @@ should also improve the parse quality. The specific example here is drawn from https://github.com/explosion/spaCy/issues/2627 Other versions of the model may not make the original mistake, so the specific example might not be apt for future versions. -''' +""" import plac import spacy + def prevent_sentence_boundaries(doc): for token in doc: if not can_be_sentence_start(token): token.is_sent_start = False return doc + def can_be_sentence_start(token): if token.i == 0: return True @@ -32,17 +34,18 @@ def can_be_sentence_start(token): else: return False + def main(): - nlp = spacy.load('en_core_web_lg') + nlp = spacy.load("en_core_web_lg") raw_text = "Been here and I'm loving it." doc = nlp(raw_text) sentences = [sent.string.strip() for sent in doc.sents] print(sentences) - nlp.add_pipe(prevent_sentence_boundaries, before='parser') + nlp.add_pipe(prevent_sentence_boundaries, before="parser") doc = nlp(raw_text) sentences = [sent.string.strip() for sent in doc.sents] print(sentences) - - -if __name__ == '__main__': + + +if __name__ == "__main__": plac.call(main) diff --git a/examples/pipeline/fix_space_entities.py b/examples/pipeline/fix_space_entities.py index 3c3ecc3f2..e3d37ad38 100644 --- a/examples/pipeline/fix_space_entities.py +++ b/examples/pipeline/fix_space_entities.py @@ -1,10 +1,11 @@ -'''Demonstrate adding a rule-based component that forces some tokens to not +"""Demonstrate adding a rule-based component that forces some tokens to not be entities, before the NER tagger is applied. This is used to hotfix the issue in https://github.com/explosion/spaCy/issues/2870 , present as of spaCy v2.0.16. 
-''' +""" import spacy from spacy.attrs import ENT_IOB + def fix_space_tags(doc): ent_iobs = doc.to_array([ENT_IOB]) for i, token in enumerate(doc): @@ -14,14 +15,16 @@ def fix_space_tags(doc): doc.from_array([ENT_IOB], ent_iobs.reshape((len(doc), 1))) return doc -def main(): - nlp = spacy.load('en_core_web_sm') - text = u'''This is some crazy test where I dont need an Apple Watch to make things bug''' - doc = nlp(text) - print('Before', doc.ents) - nlp.add_pipe(fix_space_tags, name='fix-ner', before='ner') - doc = nlp(text) - print('After', doc.ents) -if __name__ == '__main__': +def main(): + nlp = spacy.load("en_core_web_sm") + text = u"""This is some crazy test where I dont need an Apple Watch to make things bug""" + doc = nlp(text) + print("Before", doc.ents) + nlp.add_pipe(fix_space_tags, name="fix-ner", before="ner") + doc = nlp(text) + print("After", doc.ents) + + +if __name__ == "__main__": main() diff --git a/examples/pipeline/multi_processing.py b/examples/pipeline/multi_processing.py index bf5b8d516..07d3165f9 100644 --- a/examples/pipeline/multi_processing.py +++ b/examples/pipeline/multi_processing.py @@ -9,6 +9,7 @@ built-in dataset loader. Compatible with: spaCy v2.0.0+ """ from __future__ import print_function, unicode_literals + from toolz import partition_all from pathlib import Path from joblib import Parallel, delayed @@ -22,9 +23,9 @@ import spacy model=("Model name (needs tagger)", "positional", None, str), n_jobs=("Number of workers", "option", "n", int), batch_size=("Batch-size for each process", "option", "b", int), - limit=("Limit of entries from the dataset", "option", "l", int)) -def main(output_dir, model='en_core_web_sm', n_jobs=4, batch_size=1000, - limit=10000): + limit=("Limit of entries from the dataset", "option", "l", int), +) +def main(output_dir, model="en_core_web_sm", n_jobs=4, batch_size=1000, limit=10000): nlp = spacy.load(model) # load spaCy model print("Loaded model '%s'" % model) if not output_dir.exists(): @@ -37,42 +38,44 @@ def main(output_dir, model='en_core_web_sm', n_jobs=4, batch_size=1000, partitions = partition_all(batch_size, texts) executor = Parallel(n_jobs=n_jobs) do = delayed(transform_texts) - tasks = (do(nlp, i, batch, output_dir) - for i, batch in enumerate(partitions)) + tasks = (do(nlp, i, batch, output_dir) for i, batch in enumerate(partitions)) executor(tasks) def transform_texts(nlp, batch_id, texts, output_dir): print(nlp.pipe_names) - out_path = Path(output_dir) / ('%d.txt' % batch_id) + out_path = Path(output_dir) / ("%d.txt" % batch_id) if out_path.exists(): # return None in case same batch is called again return None - print('Processing batch', batch_id) - with out_path.open('w', encoding='utf8') as f: + print("Processing batch", batch_id) + with out_path.open("w", encoding="utf8") as f: for doc in nlp.pipe(texts): - f.write(' '.join(represent_word(w) for w in doc if not w.is_space)) - f.write('\n') - print('Saved {} texts to {}.txt'.format(len(texts), batch_id)) + f.write(" ".join(represent_word(w) for w in doc if not w.is_space)) + f.write("\n") + print("Saved {} texts to {}.txt".format(len(texts), batch_id)) def represent_word(word): text = word.text # True-case, i.e. try to normalize sentence-initial capitals. # Only do this if the lower-cased form is more probable. 
- if text.istitle() and is_sent_begin(word) \ - and word.prob < word.doc.vocab[text.lower()].prob: + if ( + text.istitle() + and is_sent_begin(word) + and word.prob < word.doc.vocab[text.lower()].prob + ): text = text.lower() - return text + '|' + word.tag_ + return text + "|" + word.tag_ def is_sent_begin(word): if word.i == 0: return True - elif word.i >= 2 and word.nbor(-1).text in ('.', '!', '?', '...'): + elif word.i >= 2 and word.nbor(-1).text in (".", "!", "?", "..."): return True else: return False -if __name__ == '__main__': +if __name__ == "__main__": plac.call(main) diff --git a/examples/training/conllu.py b/examples/training/conllu.py index 45c55a1e8..f949a8156 100644 --- a/examples/training/conllu.py +++ b/examples/training/conllu.py @@ -1,6 +1,6 @@ -'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes +"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes .conllu format for development data, allowing the official scorer to be used. -''' +""" from __future__ import unicode_literals import plac import tqdm @@ -35,6 +35,7 @@ spacy.lang.ja.Japanese.Defaults.use_janome = False random.seed(0) numpy.random.seed(0) + def minibatch_by_words(items, size=5000): random.shuffle(items) if isinstance(size, int): @@ -59,21 +60,31 @@ def minibatch_by_words(items, size=5000): else: break + ################ # Data reading # ################ -space_re = re.compile('\s+') -def split_text(text): - return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')] - +space_re = re.compile("\s+") -def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False, - max_doc_length=None, limit=None): - '''Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True, + +def split_text(text): + return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")] + + +def read_data( + nlp, + conllu_file, + text_file, + raw_text=True, + oracle_segments=False, + max_doc_length=None, + limit=None, +): + """Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True, include Doc objects created using nlp.make_doc and then aligned against the gold-standard sequences. If oracle_segments=True, include Doc objects - created from the gold-standard segments. At least one must be True.''' + created from the gold-standard segments. At least one must be True.""" if not raw_text and not oracle_segments: raise ValueError("At least one of raw_text or oracle_segments must be True") paragraphs = split_text(text_file.read()) @@ -87,22 +98,21 @@ def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False, for cs in cd: sent = defaultdict(list) for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs: - if '.' in id_: + if "." 
in id_: continue - if '-' in id_: + if "-" in id_: continue - id_ = int(id_)-1 - head = int(head)-1 if head != '0' else id_ - sent['words'].append(word) - sent['tags'].append(tag) - sent['heads'].append(head) - sent['deps'].append('ROOT' if dep == 'root' else dep) - sent['spaces'].append(space_after == '_') - sent['entities'] = ['-'] * len(sent['words']) - sent['heads'], sent['deps'] = projectivize(sent['heads'], - sent['deps']) + id_ = int(id_) - 1 + head = int(head) - 1 if head != "0" else id_ + sent["words"].append(word) + sent["tags"].append(tag) + sent["heads"].append(head) + sent["deps"].append("ROOT" if dep == "root" else dep) + sent["spaces"].append(space_after == "_") + sent["entities"] = ["-"] * len(sent["words"]) + sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"]) if oracle_segments: - docs.append(Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces'])) + docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"])) golds.append(GoldParse(docs[-1], **sent)) sent_annots.append(sent) @@ -128,18 +138,18 @@ def read_conllu(file_): sent = [] doc = [] for line in file_: - if line.startswith('# newdoc'): + if line.startswith("# newdoc"): if doc: docs.append(doc) doc = [] - elif line.startswith('#'): + elif line.startswith("#"): continue elif not line.strip(): if sent: doc.append(sent) sent = [] else: - sent.append(list(line.strip().split('\t'))) + sent.append(list(line.strip().split("\t"))) if len(sent[-1]) != 10: print(repr(line)) raise ValueError @@ -154,25 +164,29 @@ def _make_gold(nlp, text, sent_annots): # Flatten the conll annotations, and adjust the head indices flat = defaultdict(list) for sent in sent_annots: - flat['heads'].extend(len(flat['words'])+head for head in sent['heads']) - for field in ['words', 'tags', 'deps', 'entities', 'spaces']: + flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"]) + for field in ["words", "tags", "deps", "entities", "spaces"]: flat[field].extend(sent[field]) # Construct text if necessary - assert len(flat['words']) == len(flat['spaces']) + assert len(flat["words"]) == len(flat["spaces"]) if text is None: - text = ''.join(word+' '*space for word, space in zip(flat['words'], flat['spaces'])) + text = "".join( + word + " " * space for word, space in zip(flat["words"], flat["spaces"]) + ) doc = nlp.make_doc(text) - flat.pop('spaces') + flat.pop("spaces") gold = GoldParse(doc, **flat) return doc, gold + ############################# # Data transforms for spaCy # ############################# + def golds_to_gold_tuples(docs, golds): - '''Get out the annoying 'tuples' format used by begin_training, given the - GoldParse objects.''' + """Get out the annoying 'tuples' format used by begin_training, given the + GoldParse objects.""" tuples = [] for doc, gold in zip(docs, golds): text = doc.text @@ -186,15 +200,16 @@ def golds_to_gold_tuples(docs, golds): # Evaluation # ############## + def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None): - with text_loc.open('r', encoding='utf8') as text_file: + with text_loc.open("r", encoding="utf8") as text_file: texts = split_text(text_file.read()) docs = list(nlp.pipe(texts)) - with sys_loc.open('w', encoding='utf8') as out_file: + with sys_loc.open("w", encoding="utf8") as out_file: write_conllu(docs, out_file) - with gold_loc.open('r', encoding='utf8') as gold_file: + with gold_loc.open("r", encoding="utf8") as gold_file: gold_ud = conll17_ud_eval.load_conllu(gold_file) - with sys_loc.open('r', encoding='utf8') as sys_file: + with sys_loc.open("r", 
encoding="utf8") as sys_file: sys_ud = conll17_ud_eval.load_conllu(sys_file) scores = conll17_ud_eval.evaluate(gold_ud, sys_ud) return scores @@ -202,10 +217,10 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None): def write_conllu(docs, file_): merger = Matcher(docs[0].vocab) - merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}]) + merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}]) for i, doc in enumerate(docs): matches = merger(doc) - spans = [doc[start:end+1] for _, start, end in matches] + spans = [doc[start : end + 1] for _, start, end in matches] offsets = [(span.start_char, span.end_char) for span in spans] for start_char, end_char in offsets: doc.merge(start_char, end_char) @@ -214,58 +229,73 @@ def write_conllu(docs, file_): file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j)) file_.write("# text = {text}\n".format(text=sent.text)) for k, token in enumerate(sent): - file_.write(token._.get_conllu_lines(k) + '\n') - file_.write('\n') + file_.write(token._.get_conllu_lines(k) + "\n") + file_.write("\n") def print_progress(itn, losses, ud_scores): fields = { - 'dep_loss': losses.get('parser', 0.0), - 'tag_loss': losses.get('tagger', 0.0), - 'words': ud_scores['Words'].f1 * 100, - 'sents': ud_scores['Sentences'].f1 * 100, - 'tags': ud_scores['XPOS'].f1 * 100, - 'uas': ud_scores['UAS'].f1 * 100, - 'las': ud_scores['LAS'].f1 * 100, + "dep_loss": losses.get("parser", 0.0), + "tag_loss": losses.get("tagger", 0.0), + "words": ud_scores["Words"].f1 * 100, + "sents": ud_scores["Sentences"].f1 * 100, + "tags": ud_scores["XPOS"].f1 * 100, + "uas": ud_scores["UAS"].f1 * 100, + "las": ud_scores["LAS"].f1 * 100, } - header = ['Epoch', 'Loss', 'LAS', 'UAS', 'TAG', 'SENT', 'WORD'] + header = ["Epoch", "Loss", "LAS", "UAS", "TAG", "SENT", "WORD"] if itn == 0: - print('\t'.join(header)) - tpl = '\t'.join(( - '{:d}', - '{dep_loss:.1f}', - '{las:.1f}', - '{uas:.1f}', - '{tags:.1f}', - '{sents:.1f}', - '{words:.1f}', - )) + print("\t".join(header)) + tpl = "\t".join( + ( + "{:d}", + "{dep_loss:.1f}", + "{las:.1f}", + "{uas:.1f}", + "{tags:.1f}", + "{sents:.1f}", + "{words:.1f}", + ) + ) print(tpl.format(itn, **fields)) -#def get_sent_conllu(sent, sent_id): + +# def get_sent_conllu(sent, sent_id): # lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)] + def get_token_conllu(token, i): if token._.begins_fused: n = 1 while token.nbor(n)._.inside_fused: n += 1 - id_ = '%d-%d' % (i, i+n) - lines = [id_, token.text, '_', '_', '_', '_', '_', '_', '_', '_'] + id_ = "%d-%d" % (i, i + n) + lines = [id_, token.text, "_", "_", "_", "_", "_", "_", "_", "_"] else: lines = [] if token.head.i == token.i: head = 0 else: head = i + (token.head.i - token.i) + 1 - fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, '_', - str(head), token.dep_.lower(), '_', '_'] - lines.append('\t'.join(fields)) - return '\n'.join(lines) + fields = [ + str(i + 1), + token.text, + token.lemma_, + token.pos_, + token.tag_, + "_", + str(head), + token.dep_.lower(), + "_", + "_", + ] + lines.append("\t".join(fields)) + return "\n".join(lines) -Token.set_extension('get_conllu_lines', method=get_token_conllu) -Token.set_extension('begins_fused', default=False) -Token.set_extension('inside_fused', default=False) + +Token.set_extension("get_conllu_lines", method=get_token_conllu) +Token.set_extension("begins_fused", default=False) +Token.set_extension("inside_fused", default=False) ################## @@ -274,31 +304,32 @@ Token.set_extension('inside_fused', default=False) def load_nlp(corpus, config): 
- lang = corpus.split('_')[0] + lang = corpus.split("_")[0] nlp = spacy.blank(lang) if config.vectors: - nlp.vocab.from_disk(config.vectors / 'vocab') + nlp.vocab.from_disk(config.vectors / "vocab") return nlp + def initialize_pipeline(nlp, docs, golds, config): - nlp.add_pipe(nlp.create_pipe('parser')) + nlp.add_pipe(nlp.create_pipe("parser")) if config.multitask_tag: - nlp.parser.add_multitask_objective('tag') + nlp.parser.add_multitask_objective("tag") if config.multitask_sent: - nlp.parser.add_multitask_objective('sent_start') - nlp.parser.moves.add_action(2, 'subtok') - nlp.add_pipe(nlp.create_pipe('tagger')) + nlp.parser.add_multitask_objective("sent_start") + nlp.parser.moves.add_action(2, "subtok") + nlp.add_pipe(nlp.create_pipe("tagger")) for gold in golds: for tag in gold.tags: if tag is not None: nlp.tagger.add_label(tag) # Replace labels that didn't make the frequency cutoff actions = set(nlp.parser.labels) - label_set = set([act.split('-')[1] for act in actions if '-' in act]) + label_set = set([act.split("-")[1] for act in actions if "-" in act]) for gold in golds: for i, label in enumerate(gold.labels): if label is not None and label not in label_set: - gold.labels[i] = label.split('||')[0] + gold.labels[i] = label.split("||")[0] return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds)) @@ -306,6 +337,7 @@ def initialize_pipeline(nlp, docs, golds, config): # Command line helpers # ######################## + @attr.s class Config(object): vectors = attr.ib(default=None) @@ -318,7 +350,7 @@ class Config(object): @classmethod def load(cls, loc): - with Path(loc).open('r', encoding='utf8') as file_: + with Path(loc).open("r", encoding="utf8") as file_: cfg = json.load(file_) return cls(**cfg) @@ -331,32 +363,36 @@ class Dataset(object): self.text = None for file_path in self.path.iterdir(): name = file_path.parts[-1] - if section in name and name.endswith('conllu'): + if section in name and name.endswith("conllu"): self.conllu = file_path - elif section in name and name.endswith('txt'): + elif section in name and name.endswith("txt"): self.text = file_path if self.conllu is None: msg = "Could not find .txt file in {path} for {section}" raise IOError(msg.format(section=section, path=path)) if self.text is None: msg = "Could not find .txt file in {path} for {section}" - self.lang = self.conllu.parts[-1].split('-')[0].split('_')[0] + self.lang = self.conllu.parts[-1].split("-")[0].split("_")[0] class TreebankPaths(object): def __init__(self, ud_path, treebank, **cfg): - self.train = Dataset(ud_path / treebank, 'train') - self.dev = Dataset(ud_path / treebank, 'dev') + self.train = Dataset(ud_path / treebank, "train") + self.dev = Dataset(ud_path / treebank, "dev") self.lang = self.train.lang @plac.annotations( ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path), - corpus=("UD corpus to train and evaluate on, e.g. en, es_ancora, etc", - "positional", None, str), + corpus=( + "UD corpus to train and evaluate on, e.g. 
en, es_ancora, etc", + "positional", + None, + str, + ), parses_dir=("Directory to write the development parses", "positional", None, Path), config=("Path to json formatted config file", "positional", None, Config.load), - limit=("Size limit", "option", "n", int) + limit=("Size limit", "option", "n", int), ) def main(ud_dir, parses_dir, config, corpus, limit=0): paths = TreebankPaths(ud_dir, corpus) @@ -365,8 +401,13 @@ def main(ud_dir, parses_dir, config, corpus, limit=0): print("Train and evaluate", corpus, "using lang", paths.lang) nlp = load_nlp(paths.lang, config) - docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(), - max_doc_length=config.max_doc_length, limit=limit) + docs, golds = read_data( + nlp, + paths.train.conllu.open(), + paths.train.text.open(), + max_doc_length=config.max_doc_length, + limit=limit, + ) optimizer = initialize_pipeline(nlp, docs, golds, config) @@ -379,14 +420,19 @@ def main(ud_dir, parses_dir, config, corpus, limit=0): for batch in batches: batch_docs, batch_gold = zip(*batch) pbar.update(sum(len(doc) for doc in batch_docs)) - nlp.update(batch_docs, batch_gold, sgd=optimizer, - drop=config.dropout, losses=losses) - - out_path = parses_dir / corpus / 'epoch-{i}.conllu'.format(i=i) + nlp.update( + batch_docs, + batch_gold, + sgd=optimizer, + drop=config.dropout, + losses=losses, + ) + + out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i) with nlp.use_params(optimizer.averages): scores = evaluate(nlp, paths.dev.text, paths.dev.conllu, out_path) print_progress(i, losses, scores) -if __name__ == '__main__': +if __name__ == "__main__": plac.call(main) diff --git a/examples/training/ner_multitask_objective.py b/examples/training/ner_multitask_objective.py index 24f700324..5d44ed649 100644 --- a/examples/training/ner_multitask_objective.py +++ b/examples/training/ner_multitask_objective.py @@ -1,4 +1,4 @@ -'''This example shows how to add a multi-task objective that is trained +"""This example shows how to add a multi-task objective that is trained alongside the entity recognizer. This is an alternative to adding features to the model. @@ -19,7 +19,7 @@ The specific example here is not necessarily a good idea --- but it shows how an arbitrary objective function for some word can be used. Developed and tested for spaCy 2.0.6 -''' +""" import random import plac import spacy @@ -30,30 +30,29 @@ random.seed(0) PWD = os.path.dirname(__file__) -TRAIN_DATA = list(read_json_file(os.path.join(PWD, 'training-data.json'))) - +TRAIN_DATA = list(read_json_file(os.path.join(PWD, "training-data.json"))) def get_position_label(i, words, tags, heads, labels, ents): - '''Return labels indicating the position of the word in the document. - ''' + """Return labels indicating the position of the word in the document. 
+ """ if len(words) < 20: - return 'short-doc' + return "short-doc" elif i == 0: - return 'first-word' + return "first-word" elif i < 10: - return 'early-word' + return "early-word" elif i < 20: - return 'mid-word' - elif i == len(words)-1: - return 'last-word' + return "mid-word" + elif i == len(words) - 1: + return "last-word" else: - return 'late-word' + return "late-word" def main(n_iter=10): - nlp = spacy.blank('en') - ner = nlp.create_pipe('ner') + nlp = spacy.blank("en") + ner = nlp.create_pipe("ner") ner.add_multitask_objective(get_position_label) nlp.add_pipe(ner) @@ -71,15 +70,16 @@ def main(n_iter=10): [gold], # batch of annotations drop=0.2, # dropout - make it harder to memorise data sgd=optimizer, # callable to update weights - losses=losses) - print(losses.get('nn_labeller', 0.0), losses['ner']) + losses=losses, + ) + print(losses.get("nn_labeller", 0.0), losses["ner"]) # test the trained model for text, _ in TRAIN_DATA: doc = nlp(text) - print('Entities', [(ent.text, ent.label_) for ent in doc.ents]) - print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc]) + print("Entities", [(ent.text, ent.label_) for ent in doc.ents]) + print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc]) -if __name__ == '__main__': +if __name__ == "__main__": plac.call(main) diff --git a/examples/training/pretrain_textcat.py b/examples/training/pretrain_textcat.py index 0f551edc7..49dd28060 100644 --- a/examples/training/pretrain_textcat.py +++ b/examples/training/pretrain_textcat.py @@ -1,4 +1,4 @@ -'''This script is experimental. +"""This script is experimental. Try pre-training the CNN component of the text categorizer using a cheap language modelling-like objective. Specifically, we load pre-trained vectors @@ -12,7 +12,7 @@ To evaluate the technique, we're pre-training with the 50k texts from the IMDB corpus, and then training with only 100 labels. Note that it's a bit dirty to pre-train with the development data, but also not *so* terrible: we're not using the development labels, after all --- only the unlabelled text. 
-''' +""" import plac import random import spacy @@ -46,8 +46,8 @@ def load_textcat_data(limit=0): train_data = train_data[-limit:] texts, labels = zip(*train_data) eval_texts, eval_labels = zip(*eval_data) - cats = [{'POSITIVE': bool(y), 'NEGATIVE': not bool(y)} for y in labels] - eval_cats = [{'POSITIVE': bool(y), 'NEGATIVE': not bool(y)} for y in eval_labels] + cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels] + eval_cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in eval_labels] return (texts, cats), (eval_texts, eval_cats) @@ -57,6 +57,7 @@ def prefer_gpu(): return False else: import cupy.random + cupy.random.seed(0) return True @@ -68,7 +69,7 @@ def build_textcat_model(tok2vec, nr_class, width): from thinc.misc import Residual, LayerNorm from spacy._ml import logistic, zero_init - with Model.define_operators({'>>': chain}): + with Model.define_operators({">>": chain}): model = ( tok2vec >> flatten_add_lengths @@ -78,27 +79,35 @@ def build_textcat_model(tok2vec, nr_class, width): model.tok2vec = tok2vec return model + def block_gradients(model): from thinc.api import wrap - def forward(X, drop=0.): + + def forward(X, drop=0.0): Y, _ = model.begin_update(X, drop=drop) return Y, None + return wrap(forward, model) + def create_pipeline(width, embed_size, vectors_model): print("Load vectors") nlp = spacy.load(vectors_model) print("Start training") - textcat = TextCategorizer(nlp.vocab, - labels=['POSITIVE', 'NEGATIVE'], + textcat = TextCategorizer( + nlp.vocab, + labels=["POSITIVE", "NEGATIVE"], model=build_textcat_model( - Tok2Vec(width=width, embed_size=embed_size), 2, width)) + Tok2Vec(width=width, embed_size=embed_size), 2, width + ), + ) nlp.add_pipe(textcat) return nlp + def train_tensorizer(nlp, texts, dropout, n_iter): - tensorizer = nlp.create_pipe('tensorizer') + tensorizer = nlp.create_pipe("tensorizer") nlp.add_pipe(tensorizer) optimizer = nlp.begin_training() for i in range(n_iter): @@ -109,36 +118,43 @@ def train_tensorizer(nlp, texts, dropout, n_iter): print(losses) return optimizer + def train_textcat(nlp, n_texts, n_iter=10): - textcat = nlp.get_pipe('textcat') + textcat = nlp.get_pipe("textcat") tok2vec_weights = textcat.model.tok2vec.to_bytes() (train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts) - print("Using {} examples ({} training, {} evaluation)" - .format(n_texts, len(train_texts), len(dev_texts))) - train_data = list(zip(train_texts, - [{'cats': cats} for cats in train_cats])) + print( + "Using {} examples ({} training, {} evaluation)".format( + n_texts, len(train_texts), len(dev_texts) + ) + ) + train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats])) # get names of other pipes to disable them during training - other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat'] + other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"] with nlp.disable_pipes(*other_pipes): # only train textcat optimizer = nlp.begin_training() textcat.model.tok2vec.from_bytes(tok2vec_weights) print("Training the model...") - print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F')) + print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F")) for i in range(n_iter): - losses = {'textcat': 0.0} + losses = {"textcat": 0.0} # batch up the examples using spaCy's minibatch batches = minibatch(tqdm.tqdm(train_data), size=2) for batch in batches: texts, annotations = zip(*batch) - nlp.update(texts, annotations, sgd=optimizer, drop=0.2, - losses=losses) + nlp.update(texts, 
annotations, sgd=optimizer, drop=0.2, losses=losses) with textcat.model.use_params(optimizer.averages): # evaluate on the dev data split off in load_data() scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats) - print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}' # print a simple table - .format(losses['textcat'], scores['textcat_p'], - scores['textcat_r'], scores['textcat_f'])) + print( + "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format( # print a simple table + losses["textcat"], + scores["textcat_p"], + scores["textcat_r"], + scores["textcat_f"], + ) + ) def evaluate_textcat(tokenizer, textcat, texts, cats): @@ -153,9 +169,9 @@ def evaluate_textcat(tokenizer, textcat, texts, cats): if label not in gold: continue if score >= 0.5 and gold[label] >= 0.5: - tp += 1. + tp += 1.0 elif score >= 0.5 and gold[label] < 0.5: - fp += 1. + fp += 1.0 elif score < 0.5 and gold[label] < 0.5: tn += 1 elif score < 0.5 and gold[label] >= 0.5: @@ -163,8 +179,7 @@ def evaluate_textcat(tokenizer, textcat, texts, cats): precision = tp / (tp + fp) recall = tp / (tp + fn) f_score = 2 * (precision * recall) / (precision + recall) - return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score} - + return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score} @plac.annotations( @@ -173,10 +188,16 @@ def evaluate_textcat(tokenizer, textcat, texts, cats): pretrain_iters=("Number of iterations to pretrain", "option", "pn", int), train_iters=("Number of iterations to pretrain", "option", "tn", int), train_examples=("Number of labelled examples", "option", "eg", int), - vectors_model=("Name or path to vectors model to learn from") + vectors_model=("Name or path to vectors model to learn from"), ) -def main(width, embed_size, vectors_model, - pretrain_iters=30, train_iters=30, train_examples=1000): +def main( + width, + embed_size, + vectors_model, + pretrain_iters=30, + train_iters=30, + train_examples=1000, +): random.seed(0) numpy.random.seed(0) use_gpu = prefer_gpu() @@ -190,5 +211,6 @@ def main(width, embed_size, vectors_model, print("Train textcat") train_textcat(nlp, train_examples, n_iter=train_iters) -if __name__ == '__main__': + +if __name__ == "__main__": plac.call(main) diff --git a/examples/training/train_intent_parser.py b/examples/training/train_intent_parser.py index 7c337baff..08d06bd4c 100644 --- a/examples/training/train_intent_parser.py +++ b/examples/training/train_intent_parser.py @@ -29,73 +29,113 @@ from spacy.util import minibatch, compounding # training data: texts, heads and dependency labels # for no relation, we simply chose an arbitrary dependency label, e.g. '-' TRAIN_DATA = [ - ("find a cafe with great wifi", { - 'heads': [0, 2, 0, 5, 5, 2], # index of token head - 'deps': ['ROOT', '-', 'PLACE', '-', 'QUALITY', 'ATTRIBUTE'] - }), - ("find a hotel near the beach", { - 'heads': [0, 2, 0, 5, 5, 2], - 'deps': ['ROOT', '-', 'PLACE', 'QUALITY', '-', 'ATTRIBUTE'] - }), - ("find me the closest gym that's open late", { - 'heads': [0, 0, 4, 4, 0, 6, 4, 6, 6], - 'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'ATTRIBUTE', 'TIME'] - }), - ("show me the cheapest store that sells flowers", { - 'heads': [0, 0, 4, 4, 0, 4, 4, 4], # attach "flowers" to store! 
- 'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'PRODUCT'] - }), - ("find a nice restaurant in london", { - 'heads': [0, 3, 3, 0, 3, 3], - 'deps': ['ROOT', '-', 'QUALITY', 'PLACE', '-', 'LOCATION'] - }), - ("show me the coolest hostel in berlin", { - 'heads': [0, 0, 4, 4, 0, 4, 4], - 'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', 'LOCATION'] - }), - ("find a good italian restaurant near work", { - 'heads': [0, 4, 4, 4, 0, 4, 5], - 'deps': ['ROOT', '-', 'QUALITY', 'ATTRIBUTE', 'PLACE', 'ATTRIBUTE', 'LOCATION'] - }) + ( + "find a cafe with great wifi", + { + "heads": [0, 2, 0, 5, 5, 2], # index of token head + "deps": ["ROOT", "-", "PLACE", "-", "QUALITY", "ATTRIBUTE"], + }, + ), + ( + "find a hotel near the beach", + { + "heads": [0, 2, 0, 5, 5, 2], + "deps": ["ROOT", "-", "PLACE", "QUALITY", "-", "ATTRIBUTE"], + }, + ), + ( + "find me the closest gym that's open late", + { + "heads": [0, 0, 4, 4, 0, 6, 4, 6, 6], + "deps": [ + "ROOT", + "-", + "-", + "QUALITY", + "PLACE", + "-", + "-", + "ATTRIBUTE", + "TIME", + ], + }, + ), + ( + "show me the cheapest store that sells flowers", + { + "heads": [0, 0, 4, 4, 0, 4, 4, 4], # attach "flowers" to store! + "deps": ["ROOT", "-", "-", "QUALITY", "PLACE", "-", "-", "PRODUCT"], + }, + ), + ( + "find a nice restaurant in london", + { + "heads": [0, 3, 3, 0, 3, 3], + "deps": ["ROOT", "-", "QUALITY", "PLACE", "-", "LOCATION"], + }, + ), + ( + "show me the coolest hostel in berlin", + { + "heads": [0, 0, 4, 4, 0, 4, 4], + "deps": ["ROOT", "-", "-", "QUALITY", "PLACE", "-", "LOCATION"], + }, + ), + ( + "find a good italian restaurant near work", + { + "heads": [0, 4, 4, 4, 0, 4, 5], + "deps": [ + "ROOT", + "-", + "QUALITY", + "ATTRIBUTE", + "PLACE", + "ATTRIBUTE", + "LOCATION", + ], + }, + ), ] @plac.annotations( model=("Model name. Defaults to blank 'en' model.", "option", "m", str), output_dir=("Optional output directory", "option", "o", Path), - n_iter=("Number of training iterations", "option", "n", int)) + n_iter=("Number of training iterations", "option", "n", int), +) def main(model=None, output_dir=None, n_iter=15): """Load the model, set up the pipeline and train the parser.""" if model is not None: nlp = spacy.load(model) # load existing spaCy model print("Loaded model '%s'" % model) else: - nlp = spacy.blank('en') # create blank Language class + nlp = spacy.blank("en") # create blank Language class print("Created blank 'en' model") # We'll use the built-in dependency parser class, but we want to create a # fresh instance – just in case. 
- if 'parser' in nlp.pipe_names: - nlp.remove_pipe('parser') - parser = nlp.create_pipe('parser') + if "parser" in nlp.pipe_names: + nlp.remove_pipe("parser") + parser = nlp.create_pipe("parser") nlp.add_pipe(parser, first=True) for text, annotations in TRAIN_DATA: - for dep in annotations.get('deps', []): + for dep in annotations.get("deps", []): parser.add_label(dep) - other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser'] + other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"] with nlp.disable_pipes(*other_pipes): # only train parser optimizer = nlp.begin_training() for itn in range(n_iter): random.shuffle(TRAIN_DATA) losses = {} # batch up the examples using spaCy's minibatch - batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001)) + batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) for batch in batches: texts, annotations = zip(*batch) nlp.update(texts, annotations, sgd=optimizer, losses=losses) - print('Losses', losses) + print("Losses", losses) # test the trained model test_model(nlp) @@ -115,16 +155,18 @@ def main(model=None, output_dir=None, n_iter=15): def test_model(nlp): - texts = ["find a hotel with good wifi", - "find me the cheapest gym near work", - "show me the best hotel in berlin"] + texts = [ + "find a hotel with good wifi", + "find me the cheapest gym near work", + "show me the best hotel in berlin", + ] docs = nlp.pipe(texts) for doc in docs: print(doc.text) - print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != '-']) + print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != "-"]) -if __name__ == '__main__': +if __name__ == "__main__": plac.call(main) # Expected output: diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py index a05d552ea..8bb01b87f 100644 --- a/examples/training/train_ner.py +++ b/examples/training/train_ner.py @@ -20,51 +20,48 @@ from spacy.util import minibatch, compounding # training data TRAIN_DATA = [ - ('Who is Shaka Khan?', { - 'entities': [(7, 17, 'PERSON')] - }), - ('I like London and Berlin.', { - 'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')] - }) + ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), + ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}), ] @plac.annotations( model=("Model name. 
Defaults to blank 'en' model.", "option", "m", str), output_dir=("Optional output directory", "option", "o", Path), - n_iter=("Number of training iterations", "option", "n", int)) + n_iter=("Number of training iterations", "option", "n", int), +) def main(model=None, output_dir=None, n_iter=100): """Load the model, set up the pipeline and train the entity recognizer.""" if model is not None: nlp = spacy.load(model) # load existing spaCy model print("Loaded model '%s'" % model) else: - nlp = spacy.blank('en') # create blank Language class + nlp = spacy.blank("en") # create blank Language class print("Created blank 'en' model") # create the built-in pipeline components and add them to the pipeline # nlp.create_pipe works for built-ins that are registered with spaCy - if 'ner' not in nlp.pipe_names: - ner = nlp.create_pipe('ner') + if "ner" not in nlp.pipe_names: + ner = nlp.create_pipe("ner") nlp.add_pipe(ner, last=True) # otherwise, get it so we can add labels else: - ner = nlp.get_pipe('ner') + ner = nlp.get_pipe("ner") # add labels for _, annotations in TRAIN_DATA: - for ent in annotations.get('entities'): + for ent in annotations.get("entities"): ner.add_label(ent[2]) # get names of other pipes to disable them during training - other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner'] + other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"] with nlp.disable_pipes(*other_pipes): # only train NER optimizer = nlp.begin_training() for itn in range(n_iter): random.shuffle(TRAIN_DATA) losses = {} # batch up the examples using spaCy's minibatch - batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001)) + batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) for batch in batches: texts, annotations = zip(*batch) nlp.update( @@ -72,14 +69,15 @@ def main(model=None, output_dir=None, n_iter=100): annotations, # batch of annotations drop=0.5, # dropout - make it harder to memorise data sgd=optimizer, # callable to update weights - losses=losses) - print('Losses', losses) + losses=losses, + ) + print("Losses", losses) # test the trained model for text, _ in TRAIN_DATA: doc = nlp(text) - print('Entities', [(ent.text, ent.label_) for ent in doc.ents]) - print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc]) + print("Entities", [(ent.text, ent.label_) for ent in doc.ents]) + print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc]) # save model to output directory if output_dir is not None: @@ -94,11 +92,11 @@ def main(model=None, output_dir=None, n_iter=100): nlp2 = spacy.load(output_dir) for text, _ in TRAIN_DATA: doc = nlp2(text) - print('Entities', [(ent.text, ent.label_) for ent in doc.ents]) - print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc]) + print("Entities", [(ent.text, ent.label_) for ent in doc.ents]) + print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc]) -if __name__ == '__main__': +if __name__ == "__main__": plac.call(main) # Expected output: diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index 6a4863b8a..656ae1d83 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -35,7 +35,7 @@ from spacy.util import minibatch, compounding # new entity label -LABEL = 'ANIMAL' +LABEL = "ANIMAL" # training data # Note: If you're using an existing model, make sure to mix in examples of @@ -43,29 +43,21 @@ LABEL = 'ANIMAL' # model might learn the new type, but "forget" what it previously knew. 
# https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting TRAIN_DATA = [ - ("Horses are too tall and they pretend to care about your feelings", { - 'entities': [(0, 6, 'ANIMAL')] - }), - - ("Do they bite?", { - 'entities': [] - }), - - ("horses are too tall and they pretend to care about your feelings", { - 'entities': [(0, 6, 'ANIMAL')] - }), - - ("horses pretend to care about your feelings", { - 'entities': [(0, 6, 'ANIMAL')] - }), - - ("they pretend to care about your feelings, those horses", { - 'entities': [(48, 54, 'ANIMAL')] - }), - - ("horses?", { - 'entities': [(0, 6, 'ANIMAL')] - }) + ( + "Horses are too tall and they pretend to care about your feelings", + {"entities": [(0, 6, "ANIMAL")]}, + ), + ("Do they bite?", {"entities": []}), + ( + "horses are too tall and they pretend to care about your feelings", + {"entities": [(0, 6, "ANIMAL")]}, + ), + ("horses pretend to care about your feelings", {"entities": [(0, 6, "ANIMAL")]}), + ( + "they pretend to care about your feelings, those horses", + {"entities": [(48, 54, "ANIMAL")]}, + ), + ("horses?", {"entities": [(0, 6, "ANIMAL")]}), ] @@ -73,25 +65,26 @@ TRAIN_DATA = [ model=("Model name. Defaults to blank 'en' model.", "option", "m", str), new_model_name=("New model name for model meta.", "option", "nm", str), output_dir=("Optional output directory", "option", "o", Path), - n_iter=("Number of training iterations", "option", "n", int)) -def main(model=None, new_model_name='animal', output_dir=None, n_iter=10): + n_iter=("Number of training iterations", "option", "n", int), +) +def main(model=None, new_model_name="animal", output_dir=None, n_iter=10): """Set up the pipeline and entity recognizer, and train the new entity.""" if model is not None: nlp = spacy.load(model) # load existing spaCy model print("Loaded model '%s'" % model) else: - nlp = spacy.blank('en') # create blank Language class + nlp = spacy.blank("en") # create blank Language class print("Created blank 'en' model") # Add entity recognizer to model if it's not in the pipeline # nlp.create_pipe works for built-ins that are registered with spaCy - if 'ner' not in nlp.pipe_names: - ner = nlp.create_pipe('ner') + if "ner" not in nlp.pipe_names: + ner = nlp.create_pipe("ner") nlp.add_pipe(ner) # otherwise, get it, so we can add labels to it else: - ner = nlp.get_pipe('ner') + ner = nlp.get_pipe("ner") - ner.add_label(LABEL) # add new entity label to entity recognizer + ner.add_label(LABEL) # add new entity label to entity recognizer if model is None: optimizer = nlp.begin_training() else: @@ -100,21 +93,20 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=10): optimizer = nlp.entity.create_optimizer() # get names of other pipes to disable them during training - other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner'] + other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"] with nlp.disable_pipes(*other_pipes): # only train NER for itn in range(n_iter): random.shuffle(TRAIN_DATA) losses = {} # batch up the examples using spaCy's minibatch - batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001)) + batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) for batch in batches: texts, annotations = zip(*batch) - nlp.update(texts, annotations, sgd=optimizer, drop=0.35, - losses=losses) - print('Losses', losses) + nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses) + print("Losses", losses) # test the trained model - test_text = 'Do you like horses?' + test_text = "Do you like horses?" 
doc = nlp(test_text) print("Entities in '%s'" % test_text) for ent in doc.ents: @@ -125,7 +117,7 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=10): output_dir = Path(output_dir) if not output_dir.exists(): output_dir.mkdir() - nlp.meta['name'] = new_model_name # rename model + nlp.meta["name"] = new_model_name # rename model nlp.to_disk(output_dir) print("Saved model to", output_dir) @@ -137,5 +129,5 @@ def main(model=None, new_model_name='animal', output_dir=None, n_iter=10): print(ent.label_, ent.text) -if __name__ == '__main__': +if __name__ == "__main__": plac.call(main) diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py index f91ead7c4..aff33c88f 100644 --- a/examples/training/train_parser.py +++ b/examples/training/train_parser.py @@ -18,62 +18,69 @@ from spacy.util import minibatch, compounding # training data TRAIN_DATA = [ - ("They trade mortgage-backed securities.", { - 'heads': [1, 1, 4, 4, 5, 1, 1], - 'deps': ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct'] - }), - ("I like London and Berlin.", { - 'heads': [1, 1, 1, 2, 2, 1], - 'deps': ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct'] - }) + ( + "They trade mortgage-backed securities.", + { + "heads": [1, 1, 4, 4, 5, 1, 1], + "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"], + }, + ), + ( + "I like London and Berlin.", + { + "heads": [1, 1, 1, 2, 2, 1], + "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"], + }, + ), ] @plac.annotations( model=("Model name. Defaults to blank 'en' model.", "option", "m", str), output_dir=("Optional output directory", "option", "o", Path), - n_iter=("Number of training iterations", "option", "n", int)) + n_iter=("Number of training iterations", "option", "n", int), +) def main(model=None, output_dir=None, n_iter=10): """Load the model, set up the pipeline and train the parser.""" if model is not None: nlp = spacy.load(model) # load existing spaCy model print("Loaded model '%s'" % model) else: - nlp = spacy.blank('en') # create blank Language class + nlp = spacy.blank("en") # create blank Language class print("Created blank 'en' model") # add the parser to the pipeline if it doesn't exist # nlp.create_pipe works for built-ins that are registered with spaCy - if 'parser' not in nlp.pipe_names: - parser = nlp.create_pipe('parser') + if "parser" not in nlp.pipe_names: + parser = nlp.create_pipe("parser") nlp.add_pipe(parser, first=True) # otherwise, get it, so we can add labels to it else: - parser = nlp.get_pipe('parser') + parser = nlp.get_pipe("parser") # add labels to the parser for _, annotations in TRAIN_DATA: - for dep in annotations.get('deps', []): + for dep in annotations.get("deps", []): parser.add_label(dep) # get names of other pipes to disable them during training - other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser'] + other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"] with nlp.disable_pipes(*other_pipes): # only train parser optimizer = nlp.begin_training() for itn in range(n_iter): random.shuffle(TRAIN_DATA) losses = {} # batch up the examples using spaCy's minibatch - batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001)) + batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) for batch in batches: texts, annotations = zip(*batch) nlp.update(texts, annotations, sgd=optimizer, losses=losses) - print('Losses', losses) + print("Losses", losses) # test the trained model test_text = "I like securities." 
doc = nlp(test_text) - print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc]) + print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc]) # save model to output directory if output_dir is not None: @@ -87,10 +94,10 @@ def main(model=None, output_dir=None, n_iter=10): print("Loading from", output_dir) nlp2 = spacy.load(output_dir) doc = nlp2(test_text) - print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc]) + print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc]) -if __name__ == '__main__': +if __name__ == "__main__": plac.call(main) # expected result: diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py index 0971294e5..0673d06d2 100644 --- a/examples/training/train_tagger.py +++ b/examples/training/train_tagger.py @@ -25,11 +25,7 @@ from spacy.util import minibatch, compounding # http://universaldependencies.github.io/docs/u/pos/index.html # You may also specify morphological features for your tags, from the universal # scheme. -TAG_MAP = { - 'N': {'pos': 'NOUN'}, - 'V': {'pos': 'VERB'}, - 'J': {'pos': 'ADJ'} -} +TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}} # Usually you'll read this in, of course. Data formats vary. Ensure your # strings are unicode and that the number of tags assigned matches spaCy's @@ -37,16 +33,17 @@ TAG_MAP = { # that specifies the gold-standard tokenization, e.g.: # ("Eatblueham", {'words': ['Eat', 'blue', 'ham'] 'tags': ['V', 'J', 'N']}) TRAIN_DATA = [ - ("I like green eggs", {'tags': ['N', 'V', 'J', 'N']}), - ("Eat blue ham", {'tags': ['V', 'J', 'N']}) + ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), + ("Eat blue ham", {"tags": ["V", "J", "N"]}), ] @plac.annotations( lang=("ISO Code of language to use", "option", "l", str), output_dir=("Optional output directory", "option", "o", Path), - n_iter=("Number of training iterations", "option", "n", int)) -def main(lang='en', output_dir=None, n_iter=25): + n_iter=("Number of training iterations", "option", "n", int), +) +def main(lang="en", output_dir=None, n_iter=25): """Create a new model, set up the pipeline and train the tagger. In order to train the tagger with a custom tag map, we're creating a new Language instance with a custom vocab. @@ -54,7 +51,7 @@ def main(lang='en', output_dir=None, n_iter=25): nlp = spacy.blank(lang) # add the tagger to the pipeline # nlp.create_pipe works for built-ins that are registered with spaCy - tagger = nlp.create_pipe('tagger') + tagger = nlp.create_pipe("tagger") # Add the tags. This needs to be done before you start training. 
for tag, values in TAG_MAP.items(): tagger.add_label(tag, values) @@ -65,16 +62,16 @@ def main(lang='en', output_dir=None, n_iter=25): random.shuffle(TRAIN_DATA) losses = {} # batch up the examples using spaCy's minibatch - batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001)) + batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) for batch in batches: texts, annotations = zip(*batch) nlp.update(texts, annotations, sgd=optimizer, losses=losses) - print('Losses', losses) + print("Losses", losses) # test the trained model test_text = "I like blue eggs" doc = nlp(test_text) - print('Tags', [(t.text, t.tag_, t.pos_) for t in doc]) + print("Tags", [(t.text, t.tag_, t.pos_) for t in doc]) # save model to output directory if output_dir is not None: @@ -88,10 +85,10 @@ def main(lang='en', output_dir=None, n_iter=25): print("Loading from", output_dir) nlp2 = spacy.load(output_dir) doc = nlp2(test_text) - print('Tags', [(t.text, t.tag_, t.pos_) for t in doc]) + print("Tags", [(t.text, t.tag_, t.pos_) for t in doc]) -if __name__ == '__main__': +if __name__ == "__main__": plac.call(main) # Expected output: diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py index 61cd7e51e..73c8e9f2b 100644 --- a/examples/training/train_textcat.py +++ b/examples/training/train_textcat.py @@ -23,55 +23,62 @@ from spacy.util import minibatch, compounding model=("Model name. Defaults to blank 'en' model.", "option", "m", str), output_dir=("Optional output directory", "option", "o", Path), n_texts=("Number of texts to train from", "option", "t", int), - n_iter=("Number of training iterations", "option", "n", int)) + n_iter=("Number of training iterations", "option", "n", int), +) def main(model=None, output_dir=None, n_iter=20, n_texts=2000): if model is not None: nlp = spacy.load(model) # load existing spaCy model print("Loaded model '%s'" % model) else: - nlp = spacy.blank('en') # create blank Language class + nlp = spacy.blank("en") # create blank Language class print("Created blank 'en' model") # add the text classifier to the pipeline if it doesn't exist # nlp.create_pipe works for built-ins that are registered with spaCy - if 'textcat' not in nlp.pipe_names: - textcat = nlp.create_pipe('textcat') + if "textcat" not in nlp.pipe_names: + textcat = nlp.create_pipe("textcat") nlp.add_pipe(textcat, last=True) # otherwise, get it, so we can add labels to it else: - textcat = nlp.get_pipe('textcat') + textcat = nlp.get_pipe("textcat") # add label to text classifier - textcat.add_label('POSITIVE') + textcat.add_label("POSITIVE") # load the IMDB dataset print("Loading IMDB data...") (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts) - print("Using {} examples ({} training, {} evaluation)" - .format(n_texts, len(train_texts), len(dev_texts))) - train_data = list(zip(train_texts, - [{'cats': cats} for cats in train_cats])) + print( + "Using {} examples ({} training, {} evaluation)".format( + n_texts, len(train_texts), len(dev_texts) + ) + ) + train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats])) # get names of other pipes to disable them during training - other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat'] + other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"] with nlp.disable_pipes(*other_pipes): # only train textcat optimizer = nlp.begin_training() print("Training the model...") - print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F')) + 
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F")) for i in range(n_iter): losses = {} # batch up the examples using spaCy's minibatch - batches = minibatch(train_data, size=compounding(4., 32., 1.001)) + batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) for batch in batches: texts, annotations = zip(*batch) - nlp.update(texts, annotations, sgd=optimizer, drop=0.2, - losses=losses) + nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses) with textcat.model.use_params(optimizer.averages): # evaluate on the dev data split off in load_data() scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats) - print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}' # print a simple table - .format(losses['textcat'], scores['textcat_p'], - scores['textcat_r'], scores['textcat_f'])) + print( + "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format( # print a simple table + losses["textcat"], + scores["textcat_p"], + scores["textcat_r"], + scores["textcat_f"], + ) + ) # test the trained model test_text = "This movie sucked" @@ -99,7 +106,7 @@ def load_data(limit=0, split=0.8): random.shuffle(train_data) train_data = train_data[-limit:] texts, labels = zip(*train_data) - cats = [{'POSITIVE': bool(y)} for y in labels] + cats = [{"POSITIVE": bool(y)} for y in labels] split = int(len(train_data) * split) return (texts[:split], cats[:split]), (texts[split:], cats[split:]) @@ -116,9 +123,9 @@ def evaluate(tokenizer, textcat, texts, cats): if label not in gold: continue if score >= 0.5 and gold[label] >= 0.5: - tp += 1. + tp += 1.0 elif score >= 0.5 and gold[label] < 0.5: - fp += 1. + fp += 1.0 elif score < 0.5 and gold[label] < 0.5: tn += 1 elif score < 0.5 and gold[label] >= 0.5: @@ -126,8 +133,8 @@ def evaluate(tokenizer, textcat, texts, cats): precision = tp / (tp + fp) recall = tp / (tp + fn) f_score = 2 * (precision * recall) / (precision + recall) - return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score} + return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score} -if __name__ == '__main__': +if __name__ == "__main__": plac.call(main) diff --git a/examples/vectors_fast_text.py b/examples/vectors_fast_text.py index a443b5310..9b34811f7 100644 --- a/examples/vectors_fast_text.py +++ b/examples/vectors_fast_text.py @@ -14,8 +14,13 @@ from spacy.language import Language @plac.annotations( vectors_loc=("Path to .vec file", "positional", None, str), - lang=("Optional language ID. If not set, blank Language() will be used.", - "positional", None, str)) + lang=( + "Optional language ID. If not set, blank Language() will be used.", + "positional", + None, + str, + ), +) def main(vectors_loc, lang=None): if lang is None: nlp = Language() @@ -24,21 +29,21 @@ def main(vectors_loc, lang=None): # save the model to disk and load it back later (models always need a # "lang" setting). Use 'xx' for blank multi-language class. 
nlp = spacy.blank(lang) - with open(vectors_loc, 'rb') as file_: + with open(vectors_loc, "rb") as file_: header = file_.readline() nr_row, nr_dim = header.split() nlp.vocab.reset_vectors(width=int(nr_dim)) for line in file_: - line = line.rstrip().decode('utf8') - pieces = line.rsplit(' ', int(nr_dim)) + line = line.rstrip().decode("utf8") + pieces = line.rsplit(" ", int(nr_dim)) word = pieces[0] - vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f') + vector = numpy.asarray([float(v) for v in pieces[1:]], dtype="f") nlp.vocab.set_vector(word, vector) # add the vectors to the vocab # test the vectors and similarity - text = 'class colspan' + text = "class colspan" doc = nlp(text) print(text, doc[0].similarity(doc[1])) -if __name__ == '__main__': +if __name__ == "__main__": plac.call(main) diff --git a/examples/vectors_tensorboard.py b/examples/vectors_tensorboard.py index f29193345..4cfe7f442 100644 --- a/examples/vectors_tensorboard.py +++ b/examples/vectors_tensorboard.py @@ -14,26 +14,45 @@ import plac import spacy import tensorflow as tf import tqdm -from tensorflow.contrib.tensorboard.plugins.projector import visualize_embeddings, ProjectorConfig +from tensorflow.contrib.tensorboard.plugins.projector import ( + visualize_embeddings, + ProjectorConfig, +) @plac.annotations( vectors_loc=("Path to spaCy model that contains vectors", "positional", None, str), - out_loc=("Path to output folder for tensorboard session data", "positional", None, str), - name=("Human readable name for tsv file and vectors tensor", "positional", None, str), + out_loc=( + "Path to output folder for tensorboard session data", + "positional", + None, + str, + ), + name=( + "Human readable name for tsv file and vectors tensor", + "positional", + None, + str, + ), ) def main(vectors_loc, out_loc, name="spaCy_vectors"): meta_file = "{}.tsv".format(name) out_meta_file = path.join(out_loc, meta_file) - print('Loading spaCy vectors model: {}'.format(vectors_loc)) + print("Loading spaCy vectors model: {}".format(vectors_loc)) model = spacy.load(vectors_loc) - print('Finding lexemes with vectors attached: {}'.format(vectors_loc)) - strings_stream = tqdm.tqdm(model.vocab.strings, total=len(model.vocab.strings), leave=False) + print("Finding lexemes with vectors attached: {}".format(vectors_loc)) + strings_stream = tqdm.tqdm( + model.vocab.strings, total=len(model.vocab.strings), leave=False + ) queries = [w for w in strings_stream if model.vocab.has_vector(w)] vector_count = len(queries) - print('Building Tensorboard Projector metadata for ({}) vectors: {}'.format(vector_count, out_meta_file)) + print( + "Building Tensorboard Projector metadata for ({}) vectors: {}".format( + vector_count, out_meta_file + ) + ) # Store vector data in a tensorflow variable tf_vectors_variable = numpy.zeros((vector_count, model.vocab.vectors.shape[1])) @@ -41,22 +60,26 @@ def main(vectors_loc, out_loc, name="spaCy_vectors"): # Write a tab-separated file that contains information about the vectors for visualization # # Reference: https://www.tensorflow.org/programmers_guide/embedding#metadata - with open(out_meta_file, 'wb') as file_metadata: + with open(out_meta_file, "wb") as file_metadata: # Define columns in the first row - file_metadata.write("Text\tFrequency\n".encode('utf-8')) + file_metadata.write("Text\tFrequency\n".encode("utf-8")) # Write out a row for each vector that we add to the tensorflow variable we created vec_index = 0 for text in tqdm.tqdm(queries, total=len(queries), leave=False): # 
https://github.com/tensorflow/tensorflow/issues/9094 - text = '' if text.lstrip() == '' else text + text = "" if text.lstrip() == "" else text lex = model.vocab[text] # Store vector data and metadata tf_vectors_variable[vec_index] = model.vocab.get_vector(text) - file_metadata.write("{}\t{}\n".format(text, math.exp(lex.prob) * vector_count).encode('utf-8')) + file_metadata.write( + "{}\t{}\n".format(text, math.exp(lex.prob) * vector_count).encode( + "utf-8" + ) + ) vec_index += 1 - print('Running Tensorflow Session...') + print("Running Tensorflow Session...") sess = tf.InteractiveSession() tf.Variable(tf_vectors_variable, trainable=False, name=name) tf.global_variables_initializer().run() @@ -73,10 +96,10 @@ def main(vectors_loc, out_loc, name="spaCy_vectors"): visualize_embeddings(writer, config) # Save session and print run command to the output - print('Saving Tensorboard Session...') - saver.save(sess, path.join(out_loc, '{}.ckpt'.format(name))) - print('Done. Run `tensorboard --logdir={0}` to view in Tensorboard'.format(out_loc)) + print("Saving Tensorboard Session...") + saver.save(sess, path.join(out_loc, "{}.ckpt".format(name))) + print("Done. Run `tensorboard --logdir={0}` to view in Tensorboard".format(out_loc)) -if __name__ == '__main__': +if __name__ == "__main__": plac.call(main) From 40b57ea4acdec97534e2cbd74aeeee66456db9bd Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 2 Dec 2018 04:28:34 +0100 Subject: [PATCH 05/27] Format example --- .../information_extraction/phrase_matcher.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/examples/information_extraction/phrase_matcher.py b/examples/information_extraction/phrase_matcher.py index 28266bbd1..b49cb88e8 100644 --- a/examples/information_extraction/phrase_matcher.py +++ b/examples/information_extraction/phrase_matcher.py @@ -55,15 +55,15 @@ import spacy patterns_loc=("Path to gazetteer", "positional", None, str), text_loc=("Path to Reddit corpus file", "positional", None, str), n=("Number of texts to read", "option", "n", int), - lang=("Language class to initialise", "option", "l", str)) -def main(patterns_loc, text_loc, n=10000, lang='en'): - nlp = spacy.blank('en') + lang=("Language class to initialise", "option", "l", str), +) +def main(patterns_loc, text_loc, n=10000, lang="en"): + nlp = spacy.blank("en") nlp.vocab.lex_attr_getters = {} phrases = read_gazetteer(nlp.tokenizer, patterns_loc) count = 0 t1 = time.time() - for ent_id, text in get_matches(nlp.tokenizer, phrases, - read_text(text_loc, n=n)): + for ent_id, text in get_matches(nlp.tokenizer, phrases, read_text(text_loc, n=n)): count += 1 t2 = time.time() print("%d docs in %.3f s. 
%d matches" % (n, (t2 - t1), count)) @@ -72,7 +72,7 @@ def main(patterns_loc, text_loc, n=10000, lang='en'): def read_gazetteer(tokenizer, loc, n=-1): for i, line in enumerate(open(loc)): data = ujson.loads(line.strip()) - phrase = tokenizer(data['text']) + phrase = tokenizer(data["text"]) for w in phrase: _ = tokenizer.vocab[w.text] if len(phrase) >= 2: @@ -83,14 +83,14 @@ def read_text(bz2_loc, n=10000): with BZ2File(bz2_loc) as file_: for i, line in enumerate(file_): data = ujson.loads(line) - yield data['body'] + yield data["body"] if i >= n: break def get_matches(tokenizer, phrases, texts, max_length=6): matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length) - matcher.add('Phrase', None, *phrases) + matcher.add("Phrase", None, *phrases) for text in texts: doc = tokenizer(text) for w in doc: @@ -100,10 +100,11 @@ def get_matches(tokenizer, phrases, texts, max_length=6): yield (ent_id, doc[start:end].text) -if __name__ == '__main__': +if __name__ == "__main__": if False: import cProfile import pstats + cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof") s = pstats.Stats("Profile.prof") s.strip_dirs().sort_stats("time").print_stats() From f37863093a8c329d9e6e318f36fe7d0ca1cefdf6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 3 Dec 2018 01:28:22 +0100 Subject: [PATCH 06/27] =?UTF-8?q?=F0=9F=92=AB=20Replace=20ujson,=20msgpack?= =?UTF-8?q?=20and=20dill/pickle/cloudpickle=20with=20srsly=20(#3003)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove hacks and wrappers, keep code in sync across our libraries and move spaCy a few steps closer to only depending on packages with binary wheels 🎉 See here: https://github.com/explosion/srsly Serialization is hard, especially across Python versions and multiple platforms. After dealing with many subtle bugs over the years (encodings, locales, large files) our libraries like spaCy and Prodigy have steadily grown a number of utility functions to wrap the multiple serialization formats we need to support (especially json, msgpack and pickle). These wrapping functions ended up duplicated across our codebases, so we wanted to put them in one place. At the same time, we noticed that having a lot of small dependencies was making maintainence harder, and making installation slower. To solve this, we've made srsly standalone, by including the component packages directly within it. This way we can provide all the serialization utilities we need in a single binary wheel. 
srsly currently includes forks of the following packages: ujson msgpack msgpack-numpy cloudpickle * WIP: replace json/ujson with srsly * Replace ujson in examples Use regular json instead of srsly to make code easier to read and follow * Update requirements * Fix imports * Fix typos * Replace msgpack with srsly * Fix warning --- CONTRIBUTING.md | 3 +- bin/load_reddit.py | 6 +- .../information_extraction/phrase_matcher.py | 6 +- examples/keras_parikh_entailment/__main__.py | 2 +- .../notebooks/Decompositional Attention.ipynb | 2 +- requirements.txt | 2 +- setup.py | 2 +- spacy/cli/convert.py | 13 ++-- spacy/cli/converters/jsonl2json.py | 4 +- spacy/cli/debug_data.py | 7 +- spacy/cli/info.py | 3 +- spacy/cli/init_model.py | 5 +- spacy/cli/package.py | 7 +- spacy/cli/pretrain.py | 17 ++--- spacy/cli/profile.py | 4 +- spacy/cli/schemas/__init__.py | 4 +- spacy/cli/train.py | 13 ++-- spacy/cli/ud/ud_run_test.py | 7 +- spacy/cli/validate.py | 5 +- spacy/compat.py | 7 -- spacy/gold.pyx | 16 ++-- spacy/language.py | 12 +-- spacy/pipeline.pyx | 43 +++++------ spacy/strings.pyx | 13 ++-- spacy/syntax/_parser_model.pyx | 15 ++-- spacy/syntax/nn_parser.pyx | 13 ++-- spacy/syntax/transition_system.pyx | 11 ++- spacy/tests/util.py | 6 +- spacy/tokens/_serialize.py | 6 +- spacy/tokens/doc.pyx | 15 ++-- spacy/util.py | 74 ++----------------- spacy/vectors.pyx | 17 ++--- website/api/_top-level/_compat.jade | 8 +- 33 files changed, 130 insertions(+), 238 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cb10a1718..22cad91d6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -292,10 +292,9 @@ for example to show more specific error messages, you can use the `is_config()` helper function. ```python -from .compat import unicode_, json_dumps, is_config +from .compat import unicode_, is_config compatible_unicode = unicode_('hello world') -compatible_json = json_dumps({'key': 'value'}) if is_config(windows=True, python2=True): print("You are using Python 2 on Windows.") ``` diff --git a/bin/load_reddit.py b/bin/load_reddit.py index 5affa0fb5..507ce58c2 100644 --- a/bin/load_reddit.py +++ b/bin/load_reddit.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import bz2 import regex as re -import ujson +import srsly import sys import random import datetime @@ -44,7 +44,7 @@ class Reddit(object): line = line.strip() if not line: continue - comment = ujson.loads(line) + comment = srsly.json_loads(line) if self.is_valid(comment): text = self.strip_tags(comment["body"]) yield {"text": text} @@ -75,7 +75,7 @@ class Reddit(object): def main(path): reddit = Reddit(path) for comment in reddit: - print(ujson.dumps(comment)) + print(srsly.json_dumps(comment)) if __name__ == "__main__": diff --git a/examples/information_extraction/phrase_matcher.py b/examples/information_extraction/phrase_matcher.py index b49cb88e8..3cdc9cc86 100644 --- a/examples/information_extraction/phrase_matcher.py +++ b/examples/information_extraction/phrase_matcher.py @@ -45,7 +45,7 @@ from __future__ import print_function, unicode_literals, division from bz2 import BZ2File import time import plac -import ujson +import json from spacy.matcher import PhraseMatcher import spacy @@ -71,7 +71,7 @@ def main(patterns_loc, text_loc, n=10000, lang="en"): def read_gazetteer(tokenizer, loc, n=-1): for i, line in enumerate(open(loc)): - data = ujson.loads(line.strip()) + data = json.loads(line.strip()) phrase = tokenizer(data["text"]) for w in phrase: _ = tokenizer.vocab[w.text] @@ -82,7 +82,7 @@ def read_gazetteer(tokenizer, loc, n=-1): 
def read_text(bz2_loc, n=10000): with BZ2File(bz2_loc) as file_: for i, line in enumerate(file_): - data = ujson.loads(line) + data = json.loads(line) yield data["body"] if i >= n: break diff --git a/examples/keras_parikh_entailment/__main__.py b/examples/keras_parikh_entailment/__main__.py index 7cd66a20c..14df8e3d4 100644 --- a/examples/keras_parikh_entailment/__main__.py +++ b/examples/keras_parikh_entailment/__main__.py @@ -1,5 +1,5 @@ import numpy as np -import ujson as json +import json from keras.utils import to_categorical import plac import sys diff --git a/examples/notebooks/Decompositional Attention.ipynb b/examples/notebooks/Decompositional Attention.ipynb index b61dc9df7..8baaf7d33 100644 --- a/examples/notebooks/Decompositional Attention.ipynb +++ b/examples/notebooks/Decompositional Attention.ipynb @@ -77,7 +77,7 @@ } ], "source": [ - "import ujson as json\n", + "import json\n", "from keras.utils import to_categorical\n", "\n", "LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}\n", diff --git a/requirements.txt b/requirements.txt index 3d495277e..d68ac7a31 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,12 +6,12 @@ blis>=0.2.2,<0.3.0 murmurhash>=0.28.0,<1.1.0 cytoolz>=0.9.0,<0.10.0 plac<1.0.0,>=0.9.6 -ujson>=1.35 dill>=0.2,<0.3 regex==2018.01.10 requests>=2.13.0,<3.0.0 jsonschema>=2.6.0,<3.0.0 wasabi>=0.0.8,<1.1.0 +srsly>=0.0.4,<1.1.0 pathlib==1.0.1; python_version < "3.4" # Development dependencies cython>=0.25 diff --git a/setup.py b/setup.py index 05d074f28..99ae655bb 100755 --- a/setup.py +++ b/setup.py @@ -203,12 +203,12 @@ def setup_package(): "thinc==7.0.0.dev4", "blis>=0.2.2,<0.3.0", "plac<1.0.0,>=0.9.6", - "ujson>=1.35", "regex==2018.01.10", "dill>=0.2,<0.3", "requests>=2.13.0,<3.0.0", "jsonschema>=2.6.0,<3.0.0", "wasabi>=0.0.8,<1.1.0", + "srsly>=0.0.4,<1.1.0", 'pathlib==1.0.1; python_version < "3.4"', ], setup_requires=["wheel"], diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index b41b22036..a2c1d20e0 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -4,9 +4,9 @@ from __future__ import unicode_literals import plac from pathlib import Path from wasabi import Printer +import srsly -from ..util import write_jsonl, write_json -from ..compat import json_dumps, path2str +from ..compat import path2str from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json from .converters import ner_jsonl2json from ._messages import Messages @@ -77,9 +77,9 @@ def convert( suffix = ".{}".format(file_type) output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix) if file_type == "json": - write_json(output_file, data) + srsly.write_json(output_file, data) elif file_type == "jsonl": - write_jsonl(output_file, data) + srsly.write_jsonl(output_file, data) msg.good( Messages.M032.format(name=path2str(output_file)), Messages.M033.format(n_docs=len(data)), @@ -87,7 +87,6 @@ def convert( else: # Print to stdout if file_type == "json": - print(json_dumps(data)) + srsly.write_json("-", data) elif file_type == "jsonl": - for line in data: - print(json_dumps(line)) + srsly.write_jsonl("-", data) diff --git a/spacy/cli/converters/jsonl2json.py b/spacy/cli/converters/jsonl2json.py index 26fdca302..a281db86d 100644 --- a/spacy/cli/converters/jsonl2json.py +++ b/spacy/cli/converters/jsonl2json.py @@ -1,7 +1,7 @@ # coding: utf8 from __future__ import unicode_literals -import ujson +import srsly from ...util import get_lang_class from .._messages import Messages @@ -11,7 +11,7 @@ def ner_jsonl2json(input_data, 
lang=None, n_sents=10, use_morphology=False): if lang is None: raise ValueError(Messages.M054) json_docs = [] - input_tuples = [ujson.loads(line) for line in input_data] + input_tuples = [srsly.json_loads(line) for line in input_data] nlp = get_lang_class(lang)() for i, (raw_text, ents) in enumerate(input_tuples): doc = nlp.make_doc(raw_text) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 5bf602828..06f648124 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -5,10 +5,11 @@ from pathlib import Path from collections import Counter import plac import sys +import srsly from wasabi import Printer, MESSAGES from ..gold import GoldCorpus, read_json_object -from ..util import load_model, get_lang_class, read_json, read_jsonl +from ..util import load_model, get_lang_class # from .schemas import get_schema, validate_json from ._messages import Messages @@ -320,11 +321,11 @@ def debug_data( def _load_file(file_path, msg): file_name = file_path.parts[-1] if file_path.suffix == ".json": - data = read_json(file_path) + data = srsly.read_json(file_path) msg.good("Loaded {}".format(file_name)) return data elif file_path.suffix == ".jsonl": - data = read_jsonl(file_path) + data = srsly.read_jsonl(file_path) msg.good("Loaded {}".format(file_name)) return data msg.fail( diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 5df9ddadb..7339faaab 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -5,6 +5,7 @@ import plac import platform from pathlib import Path from wasabi import Printer +import srsly from ._messages import Messages from ..compat import path2str, basestring_, unicode_ @@ -32,7 +33,7 @@ def info(model=None, markdown=False, silent=False): meta_path = model_path / "meta.json" if not meta_path.is_file(): msg.fail(Messages.M020, meta_path, exits=1) - meta = util.read_json(meta_path) + meta = srsly.read_json(meta_path) if model_path.resolve() != model_path: meta["link"] = path2str(model_path) meta["source"] = path2str(model_path.resolve()) diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 4b3406ab0..8dc2a8cf2 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -11,12 +11,13 @@ from preshed.counter import PreshCounter import tarfile import gzip import zipfile +import srsly from wasabi import Printer from ._messages import Messages from ..vectors import Vectors from ..errors import Errors, Warnings, user_warning -from ..util import ensure_path, get_lang_class, read_jsonl +from ..util import ensure_path, get_lang_class try: import ftfy @@ -59,7 +60,7 @@ def init_model( settings.append("-c") msg.warn(Messages.M063, Messages.M064) jsonl_loc = ensure_path(jsonl_loc) - lex_attrs = read_jsonl(jsonl_loc) + lex_attrs = srsly.read_jsonl(jsonl_loc) else: clusters_loc = ensure_path(clusters_loc) freqs_loc = ensure_path(freqs_loc) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 84288ac72..916dbc1f2 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -5,9 +5,10 @@ import plac import shutil from pathlib import Path from wasabi import Printer, get_raw_input +import srsly from ._messages import Messages -from ..compat import path2str, json_dumps +from ..compat import path2str from .. import util from .. 
import about @@ -40,7 +41,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals meta_path = meta_path or input_path / "meta.json" if meta_path.is_file(): - meta = util.read_json(meta_path) + meta = srsly.read_json(meta_path) if not create_meta: # only print if user doesn't want to overwrite msg.good(Messages.M041, meta_path) else: @@ -64,7 +65,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals ) Path.mkdir(package_path, parents=True) shutil.copytree(path2str(input_path), path2str(package_path / model_name_v)) - create_file(main_path / "meta.json", json_dumps(meta)) + create_file(main_path / "meta.json", srsly.json_dumps(meta)) create_file(main_path / "setup.py", TEMPLATE_SETUP) create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST) create_file(package_path / "__init__.py", TEMPLATE_INIT) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 20d097047..70cab05c2 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -5,8 +5,6 @@ import plac import random import numpy import time -import ujson -import sys from collections import Counter from pathlib import Path from thinc.v2v import Affine, Maxout @@ -14,10 +12,10 @@ from thinc.api import wrap from thinc.misc import LayerNorm as LN from thinc.neural.util import prefer_gpu from wasabi import Printer +import srsly from ..tokens import Doc from ..attrs import ID, HEAD -from ..compat import json_dumps from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer from .. import util @@ -72,7 +70,7 @@ def pretrain( if not output_dir.exists(): output_dir.mkdir() msg.good("Created output directory") - util.write_json(output_dir / "config.json", config) + srsly.write_json(output_dir / "config.json", config) msg.good("Saved settings to config.json") # Load texts from file or stdin @@ -81,12 +79,12 @@ def pretrain( if not texts_loc.exists(): msg.fail("Input text file doesn't exist", texts_loc, exits=1) with msg.loading("Loading input texts..."): - texts = list(util.read_jsonl(texts_loc)) + texts = list(srsly.read_jsonl(texts_loc)) msg.good("Loaded input texts") random.shuffle(texts) else: # reading from stdin msg.text("Reading input text from stdin...") - texts = stream_texts() + texts = srsly.read_jsonl("-") with msg.loading("Loading model '{}'...".format(vectors_model)): nlp = util.load_model(vectors_model) @@ -130,18 +128,13 @@ def pretrain( "epoch": epoch, } with (output_dir / "log.jsonl").open("a") as file_: - file_.write(json_dumps(log) + "\n") + file_.write(srsly.json_dumps(log) + "\n") tracker.epoch_loss = 0.0 if texts_loc != "-": # Reshuffle the texts if texts were loaded from a file random.shuffle(texts) -def stream_texts(): - for line in sys.stdin: - yield ujson.loads(line) - - def make_update(model, docs, optimizer, drop=0.0): """Perform an update over a single batch of documents. 
diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index 506e55871..439ef79a1 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals, division, print_function import plac from pathlib import Path -import ujson +import srsly import cProfile import pstats import sys @@ -64,6 +64,6 @@ def _read_inputs(loc, msg): msg.info("Using data from {}".format(input_path.parts[-1])) file_ = input_path.open() for line in file_: - data = ujson.loads(line) + data = srsly.json_loads(line) text = data["text"] yield text diff --git a/spacy/cli/schemas/__init__.py b/spacy/cli/schemas/__init__.py index f478c7a9a..c502c6493 100644 --- a/spacy/cli/schemas/__init__.py +++ b/spacy/cli/schemas/__init__.py @@ -3,9 +3,9 @@ from __future__ import unicode_literals from pathlib import Path from jsonschema import Draft4Validator +import srsly from ...errors import Errors -from ...util import read_json SCHEMAS = {} @@ -25,7 +25,7 @@ def get_schema(name): schema_path = Path(__file__).parent / "{}.json".format(name) if not schema_path.exists(): raise ValueError(Errors.E104.format(name=name)) - schema = read_json(schema_path) + schema = srsly.read_json(schema_path) # TODO: replace with (stable) Draft6Validator, if available validator = Draft4Validator(schema) validator.check_schema(schema) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 9dec5d4bd..8d322e32d 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -7,6 +7,7 @@ import tqdm from thinc.neural._classes.model import Model from timeit import default_timer as timer import shutil +import srsly from wasabi import Printer from ._messages import Messages @@ -111,7 +112,7 @@ def train( msg.fail(Messages.M051, dev_path, exits=1) if meta_path is not None and not meta_path.exists(): msg.fail(Messages.M020, meta_path, exits=1) - meta = util.read_json(meta_path) if meta_path else {} + meta = srsly.read_json(meta_path) if meta_path else {} if not isinstance(meta, dict): msg.fail(Messages.M052, Messages.M053.format(meta_type=type(meta)), exits=1) if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]: @@ -226,7 +227,7 @@ def train( end_time = timer() cpu_wps = nwords / (end_time - start_time) acc_loc = output_path / ("model%d" % i) / "accuracy.json" - util.write_json(acc_loc, scorer.scores) + srsly.write_json(acc_loc, scorer.scores) # Update model meta.json meta["lang"] = nlp.lang @@ -242,7 +243,7 @@ def train( meta.setdefault("name", "model%d" % i) meta.setdefault("version", version) meta_loc = output_path / ("model%d" % i) / "meta.json" - util.write_json(meta_loc, meta) + srsly.write_json(meta_loc, meta) util.set_env_log(verbose) @@ -293,17 +294,17 @@ def _collate_best_model(meta, output_path, components): for component, best_component_src in bests.items(): shutil.rmtree(best_dest / component) shutil.copytree(best_component_src / component, best_dest / component) - accs = util.read_json(best_component_src / "accuracy.json") + accs = srsly.read_json(best_component_src / "accuracy.json") for metric in _get_metrics(component): meta["accuracy"][metric] = accs[metric] - util.write_json(best_dest / "meta.json", meta) + srsly.write_json(best_dest / "meta.json", meta) def _find_best(experiment_dir, component): accuracies = [] for epoch_model in experiment_dir.iterdir(): if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final": - accs = util.read_json(epoch_model / "accuracy.json") + accs = srsly.read_json(epoch_model / "accuracy.json") scores = [accs.get(metric, 0.0) for 
metric in _get_metrics(component)] accuracies.append((scores, epoch_model)) if accuracies: diff --git a/spacy/cli/ud/ud_run_test.py b/spacy/cli/ud/ud_run_test.py index f36df2f80..e3771fa92 100644 --- a/spacy/cli/ud/ud_run_test.py +++ b/spacy/cli/ud/ud_run_test.py @@ -9,7 +9,7 @@ import tqdm from pathlib import Path import re import sys -import json +import srsly import spacy import spacy.util @@ -44,7 +44,7 @@ from ...lang import ru # Data reading # ################ -space_re = re.compile("\s+") +space_re = re.compile(r"\s+") def split_text(text): @@ -332,8 +332,7 @@ def main(test_data_dir, experiment_dir, corpus): / corpus / "{section}-accuracy.json".format(section=section) ) - with open(acc_path, "w") as file_: - file_.write(json.dumps(accuracy, indent=2)) + srsly.write_json(acc_path, accuracy) if __name__ == "__main__": diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py index caeaf5ca9..4b5581972 100644 --- a/spacy/cli/validate.py +++ b/spacy/cli/validate.py @@ -5,11 +5,12 @@ import pkg_resources from pathlib import Path import sys import requests +import srsly from wasabi import Printer from ._messages import Messages from ..compat import path2str -from ..util import get_data_path, read_json +from ..util import get_data_path from .. import about @@ -84,7 +85,7 @@ def get_model_links(compat): meta_path = Path(model) / "meta.json" if not meta_path.exists(): continue - meta = read_json(meta_path) + meta = srsly.read_json(meta_path) link = model.parts[-1] name = meta["lang"] + "_" + meta["name"] links[link] = { diff --git a/spacy/compat.py b/spacy/compat.py index f00e2c417..c1869b85f 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import os import sys -import ujson import itertools from thinc.neural.util import copy_array @@ -54,9 +53,6 @@ if is_python2: unicode_ = unicode # noqa: F821 basestring_ = basestring # noqa: F821 input_ = raw_input # noqa: F821 - json_dumps = lambda data, indent=2: ujson.dumps( - data, indent=indent, escape_forward_slashes=False - ).decode("utf8") path2str = lambda path: str(path).decode("utf8") elif is_python3: @@ -64,9 +60,6 @@ elif is_python3: unicode_ = str basestring_ = str input_ = input - json_dumps = lambda data, indent=2: ujson.dumps( - data, indent=indent, escape_forward_slashes=False - ) path2str = lambda path: str(path) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 26ff9753a..9c0c00652 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -10,10 +10,7 @@ import numpy import tempfile import shutil from pathlib import Path -import msgpack -import json - -import ujson +import srsly from . import _align from .syntax import nonproj @@ -21,7 +18,6 @@ from .tokens import Doc from .errors import Errors from . 
import util from .util import minibatch, itershuffle -from .compat import json_dumps from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek @@ -123,12 +119,11 @@ class GoldCorpus(object): directory.mkdir() n = 0 for i, doc_tuple in enumerate(doc_tuples): - with open(directory / '{}.msg'.format(i), 'wb') as file_: - msgpack.dump([doc_tuple], file_, use_bin_type=True) + srsly.write_msgpack(directory / '{}.msg'.format(i), [doc_tuple]) n += len(doc_tuple[1]) if limit and n >= limit: break - + @staticmethod def walk_corpus(path): path = util.ensure_path(path) @@ -157,8 +152,7 @@ class GoldCorpus(object): if loc.parts[-1].endswith('json'): gold_tuples = read_json_file(loc) elif loc.parts[-1].endswith('msg'): - with loc.open('rb') as file_: - gold_tuples = msgpack.load(file_, raw=False) + gold_tuples = srsly.read_msgpack(loc) else: msg = "Cannot read from file: %s. Supported formats: .json, .msg" raise ValueError(msg % loc) @@ -378,7 +372,7 @@ def _json_iterate(loc): if square_depth == 1 and curly_depth == 0: py_str = py_raw[start : i+1].decode('utf8') try: - yield json.loads(py_str) + yield srsly.json_loads(py_str) except Exception: print(py_str) raise diff --git a/spacy/language.py b/spacy/language.py index f8afe84f7..4c3bfd5c8 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -2,7 +2,6 @@ from __future__ import absolute_import, unicode_literals import random -import ujson import itertools import weakref import functools @@ -10,6 +9,7 @@ from collections import OrderedDict from contextlib import contextmanager from copy import copy from thinc.neural import Model +import srsly from .tokenizer import Tokenizer from .vocab import Vocab @@ -18,7 +18,7 @@ from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens from .pipeline import EntityRuler -from .compat import json_dumps, izip, basestring_ +from .compat import izip, basestring_ from .gold import GoldParse from .scorer import Scorer from ._ml import link_vectors_to_models, create_default_optimizer @@ -640,7 +640,7 @@ class Language(object): serializers = OrderedDict( ( ("tokenizer", lambda p: self.tokenizer.to_disk(p, vocab=False)), - ("meta.json", lambda p: p.open("w").write(json_dumps(self.meta))), + ("meta.json", lambda p: p.open("w").write(srsly.json_dumps(self.meta))), ) ) for name, proc in self.pipeline: @@ -671,7 +671,7 @@ class Language(object): path = util.ensure_path(path) deserializers = OrderedDict( ( - ("meta.json", lambda p: self.meta.update(util.read_json(p))), + ("meta.json", lambda p: self.meta.update(srsly.read_json(p))), ( "vocab", lambda p: ( @@ -705,7 +705,7 @@ class Language(object): ( ("vocab", lambda: self.vocab.to_bytes()), ("tokenizer", lambda: self.tokenizer.to_bytes(vocab=False)), - ("meta", lambda: json_dumps(self.meta)), + ("meta", lambda: srsly.json_dumps(self.meta)), ) ) for i, (name, proc) in enumerate(self.pipeline): @@ -725,7 +725,7 @@ class Language(object): """ deserializers = OrderedDict( ( - ("meta", lambda b: self.meta.update(ujson.loads(b))), + ("meta", lambda b: self.meta.update(srsly.json_loads(b))), ( "vocab", lambda b: ( diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 3a09af644..c3b8f5fae 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -7,10 +7,7 @@ import numpy cimport numpy as np import cytoolz from collections import OrderedDict, defaultdict -import ujson - -from .util import msgpack 
-from .util import msgpack_numpy +import srsly from thinc.api import chain from thinc.v2v import Affine, Maxout, Softmax @@ -27,7 +24,6 @@ from .syntax.arc_eager cimport ArcEager from .morphology cimport Morphology from .vocab cimport Vocab from .syntax import nonproj -from .compat import json_dumps from .matcher import Matcher from .matcher import Matcher, PhraseMatcher @@ -38,7 +34,7 @@ from ._ml import Tok2Vec, build_text_classifier, build_tagger_model from ._ml import link_vectors_to_models, zero_init, flatten from ._ml import create_default_optimizer from .errors import Errors, TempErrors -from .compat import json_dumps, basestring_ +from .compat import basestring_ from . import util @@ -235,7 +231,7 @@ class EntityRuler(object): **kwargs: Other config paramters, mostly for consistency. RETURNS (EntityRuler): The loaded entity ruler. """ - patterns = msgpack.loads(patterns_bytes, raw=False) + patterns = srsly.msgpack_loads(patterns_bytes) self.add_patterns(patterns) return self @@ -244,7 +240,7 @@ class EntityRuler(object): RETURNS (bytes): The serialized patterns. """ - return msgpack.dumps(self.patterns, use_bin_type=True) + return srsly.msgpack_dumps(self.patterns) def from_disk(self, path, **kwargs): """Load the entity ruler from a file. Expects a file containing @@ -256,7 +252,7 @@ class EntityRuler(object): """ path = util.ensure_path(path) path = path.with_suffix('.jsonl') - patterns = util.read_jsonl(path) + patterns = srsly.read_jsonl(path) self.add_patterns(patterns) return self @@ -270,8 +266,7 @@ class EntityRuler(object): """ path = util.ensure_path(path) path = path.with_suffix('.jsonl') - data = [json_dumps(line, indent=0) for line in self.patterns] - path.open('w').write('\n'.join(data)) + srsly.write_jsonl(path, self.patterns) class Pipe(object): @@ -368,7 +363,7 @@ class Pipe(object): def to_bytes(self, **exclude): """Serialize the pipe to a bytestring.""" serialize = OrderedDict() - serialize['cfg'] = lambda: json_dumps(self.cfg) + serialize['cfg'] = lambda: srsly.json_dumps(self.cfg) if self.model in (True, False, None): serialize['model'] = lambda: self.model else: @@ -387,7 +382,7 @@ class Pipe(object): self.model.from_bytes(b) deserialize = OrderedDict(( - ('cfg', lambda b: self.cfg.update(ujson.loads(b))), + ('cfg', lambda b: self.cfg.update(srsly.json_loads(b))), ('vocab', lambda b: self.vocab.from_bytes(b)), ('model', load_model), )) @@ -397,7 +392,7 @@ class Pipe(object): def to_disk(self, path, **exclude): """Serialize the pipe to disk.""" serialize = OrderedDict() - serialize['cfg'] = lambda p: p.open('w').write(json_dumps(self.cfg)) + serialize['cfg'] = lambda p: srsly.write_json(p, self.cfg) serialize['vocab'] = lambda p: self.vocab.to_disk(p) if self.model not in (None, True, False): serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes()) @@ -424,8 +419,7 @@ class Pipe(object): def _load_cfg(path): if path.exists(): - with path.open() as file_: - return ujson.load(file_) + return srsly.read_json(path) else: return {} @@ -745,10 +739,9 @@ class Tagger(Pipe): else: serialize['model'] = self.model.to_bytes serialize['vocab'] = self.vocab.to_bytes - serialize['cfg'] = lambda: ujson.dumps(self.cfg) + serialize['cfg'] = lambda: srsly.json_dumps(self.cfg) tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items())) - serialize['tag_map'] = lambda: msgpack.dumps( - tag_map, use_bin_type=True) + serialize['tag_map'] = lambda: srsly.msgpack_dumps(tag_map) return util.to_bytes(serialize, exclude) def from_bytes(self, bytes_data, 
**exclude): @@ -766,7 +759,7 @@ class Tagger(Pipe): self.model.from_bytes(b) def load_tag_map(b): - tag_map = msgpack.loads(b, raw=False) + tag_map = srsly.msgpack_loads(b) self.vocab.morphology = Morphology( self.vocab.strings, tag_map=tag_map, lemmatizer=self.vocab.morphology.lemmatizer, @@ -775,7 +768,7 @@ class Tagger(Pipe): deserialize = OrderedDict(( ('vocab', lambda b: self.vocab.from_bytes(b)), ('tag_map', load_tag_map), - ('cfg', lambda b: self.cfg.update(ujson.loads(b))), + ('cfg', lambda b: self.cfg.update(srsly.json_loads(b))), ('model', lambda b: load_model(b)), )) util.from_bytes(bytes_data, deserialize, exclude) @@ -785,10 +778,9 @@ class Tagger(Pipe): tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items())) serialize = OrderedDict(( ('vocab', lambda p: self.vocab.to_disk(p)), - ('tag_map', lambda p: p.open('wb').write(msgpack.dumps( - tag_map, use_bin_type=True))), + ('tag_map', lambda p: srsly.write_msgpack(p, tag_map)), ('model', lambda p: p.open('wb').write(self.model.to_bytes())), - ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))) + ('cfg', lambda p: srsly.write_json(p, self.cfg)) )) util.to_disk(path, serialize, exclude) @@ -803,8 +795,7 @@ class Tagger(Pipe): self.model.from_bytes(file_.read()) def load_tag_map(p): - with p.open('rb') as file_: - tag_map = msgpack.loads(file_.read(), raw=False) + tag_map = srsly.read_msgpack(p) self.vocab.morphology = Morphology( self.vocab.strings, tag_map=tag_map, lemmatizer=self.vocab.morphology.lemmatizer, diff --git a/spacy/strings.pyx b/spacy/strings.pyx index b54e3f155..2c8d5fcb4 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -7,12 +7,11 @@ from libc.string cimport memcpy from libcpp.set cimport set from libc.stdint cimport uint32_t from murmurhash.mrmr cimport hash64, hash32 -import ujson +import srsly from .symbols import IDS as SYMBOLS_BY_STR from .symbols import NAMES as SYMBOLS_BY_INT from .typedefs cimport hash_t -from .compat import json_dumps from .errors import Errors from . import util @@ -197,8 +196,7 @@ cdef class StringStore: """ path = util.ensure_path(path) strings = list(self) - with path.open('w') as file_: - file_.write(json_dumps(strings)) + srsly.write_json(path, strings) def from_disk(self, path): """Loads state from a directory. Modifies the object in place and @@ -209,8 +207,7 @@ cdef class StringStore: RETURNS (StringStore): The modified `StringStore` object. """ path = util.ensure_path(path) - with path.open('r') as file_: - strings = ujson.load(file_) + strings = srsly.read_json(path) prev = list(self) self._reset_and_load(strings) for word in prev: @@ -223,7 +220,7 @@ cdef class StringStore: **exclude: Named attributes to prevent from being serialized. RETURNS (bytes): The serialized form of the `StringStore` object. """ - return json_dumps(list(self)) + return srsly.json_dumps(list(self)) def from_bytes(self, bytes_data, **exclude): """Load state from a binary string. @@ -232,7 +229,7 @@ cdef class StringStore: **exclude: Named attributes to prevent from being loaded. RETURNS (StringStore): The `StringStore` object. 
""" - strings = ujson.loads(bytes_data) + strings = srsly.json_loads(bytes_data) prev = list(self) self._reset_and_load(strings) for word in prev: diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index cfaa8ddf0..9796193f6 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -5,8 +5,6 @@ from __future__ import unicode_literals, print_function from collections import OrderedDict -import ujson -import json import numpy cimport cython.parallel import cytoolz @@ -29,7 +27,7 @@ cimport blis.cy from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten from .._ml import link_vectors_to_models, create_default_optimizer -from ..compat import json_dumps, copy_array +from ..compat import copy_array from ..tokens.doc cimport Doc from ..gold cimport GoldParse from ..errors import Errors, TempErrors @@ -119,7 +117,7 @@ cdef void predict_states(ActivationsC* A, StateC** states, VecVec.add_i(&A.scores[i*n.classes], W.hidden_bias, 1., n.classes) - + cdef void sum_state_features(float* output, const float* cached, const int* token_ids, int B, int F, int O) nogil: cdef int idx, b, f, i @@ -165,7 +163,7 @@ cdef void cpu_log_loss(float* d_scores, else: d_scores[i] = exp(scores[i]-max_) / Z - + cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs, const int* is_valid, int n) nogil: # Find minimum cost @@ -218,15 +216,15 @@ class ParserModel(Model): def begin_training(self, X, y=None): self.lower.begin_training(X, y=y) - + @property def tok2vec(self): return self._layers[0] - + @property def lower(self): return self._layers[1] - + @property def upper(self): return self._layers[2] @@ -405,4 +403,3 @@ cdef class precompute_hiddens: else: return self.ops.backprop_maxout(d_best, mask, self.nP) return state_vector, backprop_nonlinearity - diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 0663c1289..186c5c16c 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -5,8 +5,6 @@ from __future__ import unicode_literals, print_function from collections import OrderedDict -import ujson -import json import numpy cimport cython.parallel import cytoolz @@ -27,6 +25,7 @@ from thinc.misc import LayerNorm from thinc.neural.ops import CupyOps from thinc.neural.util import get_array_module from thinc.linalg cimport Vec, VecVec +import srsly from ._parser_model cimport resize_activations, predict_states, arg_max_if_valid from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss @@ -34,7 +33,7 @@ from ._parser_model cimport get_c_weights, get_c_sizes from ._parser_model import ParserModel from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten from .._ml import link_vectors_to_models, create_default_optimizer -from ..compat import json_dumps, copy_array +from ..compat import copy_array from ..tokens.doc cimport Doc from ..gold cimport GoldParse from ..errors import Errors, TempErrors @@ -539,7 +538,7 @@ cdef class Parser: 'model': lambda p: (self.model.to_disk(p) if self.model is not True else True), 'vocab': lambda p: self.vocab.to_disk(p), 'moves': lambda p: self.moves.to_disk(p, strings=False), - 'cfg': lambda p: p.open('w').write(json_dumps(self.cfg)) + 'cfg': lambda p: srsly.write_json(p, self.cfg) } util.to_disk(path, serializers, exclude) @@ -547,7 +546,7 @@ cdef class Parser: deserializers = { 'vocab': lambda p: self.vocab.from_disk(p), 'moves': lambda p: self.moves.from_disk(p, strings=False), - 'cfg': lambda p: self.cfg.update(util.read_json(p)), + 'cfg': lambda p: 
self.cfg.update(srsly.read_json(p)), 'model': lambda p: None } util.from_disk(path, deserializers, exclude) @@ -568,7 +567,7 @@ cdef class Parser: ('model', lambda: (self.model.to_bytes() if self.model is not True else True)), ('vocab', lambda: self.vocab.to_bytes()), ('moves', lambda: self.moves.to_bytes(strings=False)), - ('cfg', lambda: json.dumps(self.cfg, indent=2, sort_keys=True)) + ('cfg', lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True)) )) return util.to_bytes(serializers, exclude) @@ -576,7 +575,7 @@ cdef class Parser: deserializers = OrderedDict(( ('vocab', lambda b: self.vocab.from_bytes(b)), ('moves', lambda b: self.moves.from_bytes(b, strings=False)), - ('cfg', lambda b: self.cfg.update(json.loads(b))), + ('cfg', lambda b: self.cfg.update(srsly.json_loads(b))), ('model', lambda b: None) )) msg = util.from_bytes(bytes_data, deserializers, exclude) diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index fc84fc23a..6d64a4fb4 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -7,14 +7,13 @@ from cymem.cymem cimport Pool from thinc.typedefs cimport weight_t from thinc.extra.search cimport Beam from collections import OrderedDict, Counter -import ujson +import srsly from . cimport _beam_utils from ..tokens.doc cimport Doc from ..structs cimport TokenC from .stateclass cimport StateClass from ..typedefs cimport attr_t -from ..compat import json_dumps from ..errors import Errors from .. import util @@ -153,13 +152,13 @@ cdef class TransitionSystem: # Make sure we take a copy here, and that we get a Counter self.labels[action] = Counter() # Have to be careful here: Sorting must be stable, or our model - # won't be read back in correctly. + # won't be read back in correctly. 
sorted_labels = [(f, L) for L, f in label_freqs.items()] sorted_labels.sort() sorted_labels.reverse() for freq, label_str in sorted_labels: self.add_action(int(action), label_str) - self.labels[action][label_str] = freq + self.labels[action][label_str] = freq def add_action(self, int action, label_name): cdef attr_t label_id @@ -204,7 +203,7 @@ cdef class TransitionSystem: def to_bytes(self, **exclude): transitions = [] serializers = { - 'moves': lambda: json_dumps(self.labels), + 'moves': lambda: srsly.json_dumps(self.labels), 'strings': lambda: self.strings.to_bytes() } return util.to_bytes(serializers, exclude) @@ -212,7 +211,7 @@ cdef class TransitionSystem: def from_bytes(self, bytes_data, **exclude): labels = {} deserializers = { - 'moves': lambda b: labels.update(ujson.loads(b)), + 'moves': lambda b: labels.update(srsly.json_loads(b)), 'strings': lambda b: self.strings.from_bytes(b) } msg = util.from_bytes(bytes_data, deserializers, exclude) diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 80fbb5b1c..175480fe7 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -5,7 +5,7 @@ import numpy import tempfile import shutil import contextlib -import msgpack +import srsly from pathlib import Path from spacy.tokens import Doc, Span from spacy.attrs import POS, HEAD, DEP @@ -100,8 +100,8 @@ def assert_docs_equal(doc1, doc2): def assert_packed_msg_equal(b1, b2): """Assert that two packed msgpack messages are equal.""" - msg1 = msgpack.loads(b1, encoding="utf8") - msg2 = msgpack.loads(b2, encoding="utf8") + msg1 = srsly.msgpack_loads(b1) + msg2 = srsly.msgpack_loads(b2) assert sorted(msg1.keys()) == sorted(msg2.keys()) for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())): assert k1 == k2 diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 683a3974f..5c3bf9c70 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -1,8 +1,8 @@ from __future__ import unicode_literals import numpy -import msgpack import gzip +import srsly from thinc.neural.ops import NumpyOps from ..compat import copy_reg @@ -74,11 +74,11 @@ class Binder(object): "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"), "strings": list(self.strings), } - return gzip.compress(msgpack.dumps(msg)) + return gzip.compress(srsly.msgpack_dumps(msg)) def from_bytes(self, string): """Deserialize the binder's annotations from a byte string.""" - msg = msgpack.loads(gzip.decompress(string)) + msg = srsly.msgpack_loads(gzip.decompress(string)) self.attrs = msg["attrs"] self.strings = set(msg["strings"]) lengths = numpy.fromstring(msg["lengths"], dtype="int32") diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index b845b4eb7..cd2428d79 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -10,8 +10,8 @@ import numpy import numpy.linalg import struct import dill -import msgpack from thinc.neural.util import get_array_module, copy_array +import srsly from libc.string cimport memcpy, memset from libc.math cimport sqrt @@ -28,7 +28,7 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB from ..attrs cimport ENT_TYPE, SENT_START from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t -from ..util import normalize_slice, is_json_serializable +from ..util import normalize_slice from ..compat import is_config, copy_reg, pickle, basestring_ from ..errors import deprecation_warning, models_warning, user_warning from ..errors import Errors, 
Warnings @@ -807,8 +807,8 @@ cdef class Doc: } if 'user_data' not in exclude and self.user_data: user_data_keys, user_data_values = list(zip(*self.user_data.items())) - serializers['user_data_keys'] = lambda: msgpack.dumps(user_data_keys) - serializers['user_data_values'] = lambda: msgpack.dumps(user_data_values) + serializers['user_data_keys'] = lambda: srsly.msgpack_dumps(user_data_keys) + serializers['user_data_values'] = lambda: srsly.msgpack_dumps(user_data_values) return util.to_bytes(serializers, exclude) @@ -836,9 +836,8 @@ cdef class Doc: # keys, we must have tuples. In values we just have to hope # users don't mind getting a list instead of a tuple. if 'user_data' not in exclude and 'user_data_keys' in msg: - user_data_keys = msgpack.loads(msg['user_data_keys'], - use_list=False, raw=False) - user_data_values = msgpack.loads(msg['user_data_values'], raw=False) + user_data_keys = srsly.msgpack_loads(msg['user_data_keys'], use_list=False) + user_data_values = srsly.msgpack_loads(msg['user_data_values']) for key, value in zip(user_data_keys, user_data_values): self.user_data[key] = value @@ -996,7 +995,7 @@ cdef class Doc: if not self.has_extension(attr): raise ValueError(Errors.E106.format(attr=attr, opts=underscore)) value = self._.get(attr) - if not is_json_serializable(value): + if not srsly.is_json_serializable(value): raise ValueError(Errors.E107.format(attr=attr, value=repr(value))) data['_'][attr] = value return data diff --git a/spacy/util.py b/spacy/util.py index d8c82da89..7e700be03 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals, print_function import os -import ujson import pkg_resources import importlib import regex as re @@ -15,18 +14,13 @@ import functools import cytoolz import itertools import numpy.random - +import srsly from .symbols import ORTH from .compat import cupy, CudaStream, path2str, basestring_, unicode_ -from .compat import import_file, json_dumps +from .compat import import_file from .errors import Errors -# Import these directly from Thinc, so that we're sure we always have the -# same version. -from thinc.neural._classes.model import msgpack # noqa: F401 -from thinc.neural._classes.model import msgpack_numpy # noqa: F401 - LANGUAGES = {} _data_path = Path(__file__).parent / "data" @@ -185,7 +179,7 @@ def get_model_meta(path): meta_path = model_path / "meta.json" if not meta_path.is_file(): raise IOError(Errors.E053.format(path=meta_path)) - meta = read_json(meta_path) + meta = srsly.read_json(meta_path) for setting in ["lang", "name", "version"]: if setting not in meta or not meta[setting]: raise ValueError(Errors.E054.format(setting=setting)) @@ -529,74 +523,16 @@ def itershuffle(iterable, bufsize=1000): raise StopIteration -def read_json(location): - """Open and load JSON from file. - - location (Path): Path to JSON file. - RETURNS (dict): Loaded JSON content. - """ - location = ensure_path(location) - with location.open("r", encoding="utf8") as f: - return ujson.load(f) - - -def write_json(file_path, contents): - """Create a .json file and dump contents. - - file_path (unicode / Path): The path to the output file. - contents: The JSON-serializable contents to output. - """ - with Path(file_path).open("w", encoding="utf8") as f: - f.write(json_dumps(contents)) - - -def read_jsonl(file_path): - """Read a .jsonl file and yield its contents line by line. - - file_path (unicode / Path): The file path. - YIELDS: The loaded JSON contents of each line. 
- """ - with Path(file_path).open("r", encoding="utf8") as f: - for line in f: - try: # hack to handle broken jsonl - yield ujson.loads(line.strip()) - except ValueError: - continue - - -def write_jsonl(file_path, lines): - """Create a .jsonl file and dump contents. - - file_path (unicode / Path): The path to the output file. - lines (list): The JSON-serializable contents of each line. - """ - data = [json_dumps(line) for line in lines] - with Path(file_path).open("w", encoding="utf-8") as f: - f.write("\n".join(data)) - - -def is_json_serializable(obj): - """Check if a Python object is JSON-serializable.""" - if hasattr(obj, "__call__"): - # Check this separately here to prevent infinite recursions - return False - try: - ujson.dumps(obj) - return True - except TypeError: - return False - - def to_bytes(getters, exclude): serialized = OrderedDict() for key, getter in getters.items(): if key not in exclude: serialized[key] = getter() - return msgpack.dumps(serialized, use_bin_type=True) + return srsly.msgpack_dumps(serialized) def from_bytes(bytes_data, setters, exclude): - msg = msgpack.loads(bytes_data, raw=False) + msg = srsly.msgpack_loads(bytes_data) for key, setter in setters.items(): if key not in exclude and key in msg: setter(msg[key]) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 3e3268bfa..911eff08e 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -4,9 +4,7 @@ from __future__ import unicode_literals import functools import numpy from collections import OrderedDict - -from .util import msgpack -from .util import msgpack_numpy +import srsly cimport numpy as np from thinc.neural.util import get_array_module @@ -353,7 +351,7 @@ cdef class Vectors: save_array = lambda arr, file_: xp.save(file_, arr) serializers = OrderedDict(( ('vectors', lambda p: save_array(self.data, p.open('wb'))), - ('key2row', lambda p: msgpack.dump(self.key2row, p.open('wb'))) + ('key2row', lambda p: srsly.write_msgpack(p, self.key2row)) )) return util.to_disk(path, serializers, exclude) @@ -366,8 +364,7 @@ cdef class Vectors: """ def load_key2row(path): if path.exists(): - with path.open('rb') as file_: - self.key2row = msgpack.load(file_) + self.key2row = srsly.read_msgpack(path) for key, row in self.key2row.items(): if self._unset.count(row): self._unset.erase(self._unset.find(row)) @@ -401,9 +398,9 @@ cdef class Vectors: if hasattr(self.data, 'to_bytes'): return self.data.to_bytes() else: - return msgpack.dumps(self.data) + return srsly.msgpack_dumps(self.data) serializers = OrderedDict(( - ('key2row', lambda: msgpack.dumps(self.key2row)), + ('key2row', lambda: srsly.msgpack_dumps(self.key2row)), ('vectors', serialize_weights) )) return util.to_bytes(serializers, exclude) @@ -419,10 +416,10 @@ cdef class Vectors: if hasattr(self.data, 'from_bytes'): self.data.from_bytes() else: - self.data = msgpack.loads(b) + self.data = srsly.msgpack_loads(b) deserializers = OrderedDict(( - ('key2row', lambda b: self.key2row.update(msgpack.loads(b))), + ('key2row', lambda b: self.key2row.update(srsly.msgpack_loads(b))), ('vectors', deserialize_weights) )) util.from_bytes(data, deserializers, exclude) diff --git a/website/api/_top-level/_compat.jade b/website/api/_top-level/_compat.jade index c9b023647..7de2f4102 100644 --- a/website/api/_top-level/_compat.jade +++ b/website/api/_top-level/_compat.jade @@ -9,10 +9,9 @@ p | underscore, e.e #[code unicode_]. +aside-code("Example"). 
- from spacy.compat import unicode_, json_dumps + from spacy.compat import unicode_ compatible_unicode = unicode_('hello world') - compatible_json = json_dumps({'key': 'value'}) +table(["Name", "Python 2", "Python 3"]) +row @@ -35,11 +34,6 @@ p +cell #[code raw_input] +cell #[code input] - +row - +cell #[code compat.json_dumps] - +cell #[code ujson.dumps] with #[code .decode('utf8')] - +cell #[code ujson.dumps] - +row +cell #[code compat.path2str] +cell #[code str(path)] with #[code .decode('utf8')] From 8e9a4d2f5e5e2a9ff37e669be1c2c918d89634cc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 3 Dec 2018 01:59:50 +0100 Subject: [PATCH 07/27] Increment version to 2.1.0a5 --- spacy/about.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/about.py b/spacy/about.py index d21aad7e4..c00124889 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,13 +4,13 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "2.1.0a4" +__version__ = "2.1.0a5" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __uri__ = "https://spacy.io" __author__ = "Explosion AI" __email__ = "contact@explosion.ai" __license__ = "MIT" -__release__ = False +__release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From a7b085ae46c24e8f4e6acd40f1039fc133797fac Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 3 Dec 2018 02:03:26 +0100 Subject: [PATCH 08/27] Set version back to 2.1.0a4 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index c00124889..dfa292d62 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,7 +4,7 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "2.1.0a5" +__version__ = "2.1.0a4" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __uri__ = "https://spacy.io" __author__ = "Explosion AI" From ee4733b48cd43300f55c038954f0d16d11569954 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 3 Dec 2018 02:10:37 +0100 Subject: [PATCH 09/27] Update srsly version pin --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index d68ac7a31..cdab5aace 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ regex==2018.01.10 requests>=2.13.0,<3.0.0 jsonschema>=2.6.0,<3.0.0 wasabi>=0.0.8,<1.1.0 -srsly>=0.0.4,<1.1.0 +srsly>=0.0.5,<1.1.0 pathlib==1.0.1; python_version < "3.4" # Development dependencies cython>=0.25 diff --git a/setup.py b/setup.py index 99ae655bb..0f4b501a4 100755 --- a/setup.py +++ b/setup.py @@ -208,7 +208,7 @@ def setup_package(): "requests>=2.13.0,<3.0.0", "jsonschema>=2.6.0,<3.0.0", "wasabi>=0.0.8,<1.1.0", - "srsly>=0.0.4,<1.1.0", + "srsly>=0.0.5,<1.1.0", 'pathlib==1.0.1; python_version < "3.4"', ], setup_requires=["wheel"], From 5b2741f75133a8d2299f37091f4fd3e17539b01f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 3 Dec 2018 02:12:07 +0100 Subject: [PATCH 10/27] Remove unused cytoolz / itertools imports --- examples/training/conllu.py | 1 - spacy/cli/converters/iob2json.py | 4 ++-- spacy/cli/ud/ud_run_test.py | 1 - spacy/cli/ud/ud_train.py | 1 - spacy/gold.pyx | 2 -- spacy/syntax/_parser_model.pyx | 1 - 6 files changed, 2 insertions(+), 8 deletions(-) diff --git a/examples/training/conllu.py b/examples/training/conllu.py index 
f949a8156..a7745b93a 100644 --- a/examples/training/conllu.py +++ b/examples/training/conllu.py @@ -22,7 +22,6 @@ from spacy.matcher import Matcher import itertools import random import numpy.random -import cytoolz import conll17_ud_eval diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py index 24e78989b..35dc29bb3 100644 --- a/spacy/cli/converters/iob2json.py +++ b/spacy/cli/converters/iob2json.py @@ -1,7 +1,7 @@ # coding: utf8 from __future__ import unicode_literals -from cytoolz import partition_all +import cytoolz from ...gold import iob_to_biluo @@ -11,7 +11,7 @@ def iob2json(input_data, n_sents=10, *args, **kwargs): Convert IOB files into JSON format for use with train cli. """ docs = [] - for group in partition_all(n_sents, docs): + for group in cytoolz.partition_all(n_sents, docs): group = list(group) first = group.pop(0) to_extend = first["paragraphs"][0]["sentences"] diff --git a/spacy/cli/ud/ud_run_test.py b/spacy/cli/ud/ud_run_test.py index e3771fa92..43140eb03 100644 --- a/spacy/cli/ud/ud_run_test.py +++ b/spacy/cli/ud/ud_run_test.py @@ -30,7 +30,6 @@ Fused_inside = None import itertools import random import numpy.random -import cytoolz from . import conll17_ud_eval diff --git a/spacy/cli/ud/ud_train.py b/spacy/cli/ud/ud_train.py index 746607be0..6c4fbb3eb 100644 --- a/spacy/cli/ud/ud_train.py +++ b/spacy/cli/ud/ud_train.py @@ -25,7 +25,6 @@ from timeit import default_timer as timer import itertools import random import numpy.random -import cytoolz from . import conll17_ud_eval diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 9c0c00652..8bdd42a83 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -4,8 +4,6 @@ from __future__ import unicode_literals, print_function import re import random -import cytoolz -import itertools import numpy import tempfile import shutil diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index 9796193f6..f60354be8 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -7,7 +7,6 @@ from __future__ import unicode_literals, print_function from collections import OrderedDict import numpy cimport cython.parallel -import cytoolz import numpy.random cimport numpy as np from libc.math cimport exp From 1c71fdb8058595a4266802a48ed2ff8534ebabaf Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 3 Dec 2018 02:19:12 +0100 Subject: [PATCH 11/27] Remove cytoolz usage from spaCy --- spacy/pipeline.pyx | 9 ++++----- spacy/syntax/nn_parser.pyx | 5 ++--- spacy/util.py | 3 +-- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index c3b8f5fae..b1e046b5b 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -5,7 +5,6 @@ from __future__ import unicode_literals import numpy cimport numpy as np -import cytoolz from collections import OrderedDict, defaultdict import srsly @@ -302,7 +301,7 @@ class Pipe(object): Both __call__ and pipe should delegate to the `predict()` and `set_annotations()` methods. """ - for docs in cytoolz.partition_all(batch_size, stream): + for docs in util.minibatch(stream, size=batch_size): docs = list(docs) scores, tensors = self.predict(docs) self.set_annotations(docs, scores, tensor=tensors) @@ -479,7 +478,7 @@ class Tensorizer(Pipe): n_threads (int): Number of threads. YIELDS (iterator): A sequence of `Doc` objects, in order of input. 
""" - for docs in cytoolz.partition_all(batch_size, stream): + for docs in util.minibatch(stream, size=batch_size): docs = list(docs) tensors = self.predict(docs) self.set_annotations(docs, tensors) @@ -588,7 +587,7 @@ class Tagger(Pipe): return doc def pipe(self, stream, batch_size=128, n_threads=-1): - for docs in cytoolz.partition_all(batch_size, stream): + for docs in util.minibatch(stream, size=batch_size): docs = list(docs) tag_ids, tokvecs = self.predict(docs) self.set_annotations(docs, tag_ids, tensors=tokvecs) @@ -1073,7 +1072,7 @@ class TextCategorizer(Pipe): return doc def pipe(self, stream, batch_size=128, n_threads=-1): - for docs in cytoolz.partition_all(batch_size, stream): + for docs in util.minibatch(stream, size=batch_size): docs = list(docs) scores, tensors = self.predict(docs) self.set_annotations(docs, scores, tensors=tensors) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 186c5c16c..61bbbc967 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -7,7 +7,6 @@ from __future__ import unicode_literals, print_function from collections import OrderedDict import numpy cimport cython.parallel -import cytoolz import numpy.random cimport numpy as np from cpython.ref cimport PyObject, Py_XDECREF @@ -213,10 +212,10 @@ cdef class Parser: beam_width = self.cfg.get('beam_width', 1) beam_density = self.cfg.get('beam_density', 0.) cdef Doc doc - for batch in cytoolz.partition_all(batch_size, docs): + for batch in util.minibatch(docs, size=batch_size): batch_in_order = list(batch) by_length = sorted(batch_in_order, key=lambda doc: len(doc)) - for subbatch in cytoolz.partition_all(8, by_length): + for subbatch in util.minibatch(by_length, size=8): subbatch = list(subbatch) parse_states = self.predict(subbatch, beam_width=beam_width, beam_density=beam_density) diff --git a/spacy/util.py b/spacy/util.py index 7e700be03..0a682fcaa 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -11,7 +11,6 @@ from collections import OrderedDict from thinc.neural._classes.model import Model from thinc.neural.ops import NumpyOps import functools -import cytoolz import itertools import numpy.random import srsly @@ -403,7 +402,7 @@ def minibatch(items, size=8): items = iter(items) while True: batch_size = next(size_) - batch = list(cytoolz.take(int(batch_size), items)) + batch = list(itertools.islice(items, int(batch_size))) if len(batch) == 0: break yield list(batch) From 2402ef498be5fd5289dea87262c52ff56c1b8087 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 3 Dec 2018 02:19:23 +0100 Subject: [PATCH 12/27] Remove unused import --- spacy/gold.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 9c0c00652..d7c5275f6 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -4,7 +4,6 @@ from __future__ import unicode_literals, print_function import re import random -import cytoolz import itertools import numpy import tempfile From db75c70550172ff9fc13d6645e42c6ffa2d599c0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 3 Dec 2018 02:31:19 +0100 Subject: [PATCH 13/27] Remove dill dependency --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index cdab5aace..4704faa47 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,6 @@ blis>=0.2.2,<0.3.0 murmurhash>=0.28.0,<1.1.0 cytoolz>=0.9.0,<0.10.0 plac<1.0.0,>=0.9.6 -dill>=0.2,<0.3 regex==2018.01.10 requests>=2.13.0,<3.0.0 jsonschema>=2.6.0,<3.0.0 From 5ed19fbee2e03b6dc9c3461850f6c4dad15c4134 Mon Sep 
17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 3 Dec 2018 02:37:22 +0100 Subject: [PATCH 14/27] Remove cytoolz dependency --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4704faa47..2979bdfc6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,6 @@ preshed>=2.0.1,<2.1.0 thinc==7.0.0.dev4 blis>=0.2.2,<0.3.0 murmurhash>=0.28.0,<1.1.0 -cytoolz>=0.9.0,<0.10.0 plac<1.0.0,>=0.9.6 regex==2018.01.10 requests>=2.13.0,<3.0.0 From 3df26d820f0185d442a8030b287f5235749863cb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 3 Dec 2018 02:41:05 +0100 Subject: [PATCH 15/27] Sort requirements --- requirements.txt | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2979bdfc6..9df42b485 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,17 @@ -numpy>=1.15.0 +# Our libraries cymem>=2.0.2,<2.1.0 preshed>=2.0.1,<2.1.0 thinc==7.0.0.dev4 blis>=0.2.2,<0.3.0 murmurhash>=0.28.0,<1.1.0 -plac<1.0.0,>=0.9.6 -regex==2018.01.10 -requests>=2.13.0,<3.0.0 -jsonschema>=2.6.0,<3.0.0 wasabi>=0.0.8,<1.1.0 srsly>=0.0.5,<1.1.0 +# Third party dependencies +numpy>=1.15.0 +requests>=2.13.0,<3.0.0 +jsonschema>=2.6.0,<3.0.0 +regex==2018.01.10 +plac<1.0.0,>=0.9.6 pathlib==1.0.1; python_version < "3.4" # Development dependencies cython>=0.25 From ea00dbaaa4912e16f0d5e27d091e858a777f8fee Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 3 Dec 2018 02:43:03 +0100 Subject: [PATCH 16/27] Remove usage of itertools.islice --- spacy/syntax/nn_parser.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 61bbbc967..55e667752 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -9,6 +9,7 @@ import numpy cimport cython.parallel import numpy.random cimport numpy as np +from itertools import islice from cpython.ref cimport PyObject, Py_XDECREF from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno from libc.math cimport exp @@ -515,7 +516,7 @@ cdef class Parser: sgd = self.create_optimizer() doc_sample = [] gold_sample = [] - for raw_text, annots_brackets in cytoolz.take(1000, get_gold_tuples()): + for raw_text, annots_brackets in islice(get_gold_tuples(), 1000): for annots, brackets in annots_brackets: ids, words, tags, heads, deps, ents = annots doc_sample.append(Doc(self.vocab, words=words)) From 378ca4b46df9f88dfa359e994acab763b665d2f7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 4 Dec 2018 00:06:42 +0100 Subject: [PATCH 17/27] Fix OSX build problem --- setup.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 0f4b501a4..501dfe153 100755 --- a/setup.py +++ b/setup.py @@ -7,10 +7,27 @@ import sys import contextlib from distutils.command.build_ext import build_ext from distutils.sysconfig import get_python_inc +import distutils.util from distutils import ccompiler, msvccompiler from setuptools import Extension, setup, find_packages +def is_new_osx(): + '''Check whether we're on OSX >= 10.10''' + name = distutils.util.get_platform() + if sys.platform != 'darwin': + return False + elif name.startswith('macosx-10'): + minor_version = int(name.split('-')[1].split('.')[1]) + if minor_version >= 7: + return True + else: + return False + else: + return False + + + PACKAGE_DATA = {"": ["*.pyx", "*.pxd", "*.txt", "*.tokens"]} @@ -57,8 +74,17 @@ COMPILE_OPTIONS = { LINK_OPTIONS = 
{"msvc": [], "mingw32": [], "other": []} -# I don't understand this very well yet. See Issue #267 -# Fingers crossed! +if is_new_osx(): + # On Mac, use libc++ because Apple deprecated use of + # libstdc + COMPILE_OPTIONS["other"].append("-stdlib=libc++") + LINK_OPTIONS["other"].append("-lc++") + # g++ (used by unix compiler on mac) links to libstdc++ as a default lib. + # See: https://stackoverflow.com/questions/1653047/avoid-linking-to-libstdc + LINK_OPTIONS["other"].append("-nodefaultlibs") + + + USE_OPENMP_DEFAULT = "0" if sys.platform != "darwin" else None if os.environ.get("USE_OPENMP", USE_OPENMP_DEFAULT) == "1": if sys.platform == "darwin": From 8f6555df4e1596070d758a395dec2ef2dbd7b373 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 4 Dec 2018 00:07:28 +0100 Subject: [PATCH 18/27] Update requirements --- requirements.txt | 2 +- setup.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9df42b485..8213bac7b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=2.0.1,<2.1.0 -thinc==7.0.0.dev4 +thinc==7.0.0.dev6 blis>=0.2.2,<0.3.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.0.8,<1.1.0 diff --git a/setup.py b/setup.py index 501dfe153..90d2dfb6f 100755 --- a/setup.py +++ b/setup.py @@ -226,11 +226,10 @@ def setup_package(): "murmurhash>=0.28.0,<1.1.0", "cymem>=2.0.2,<2.1.0", "preshed>=2.0.1,<2.1.0", - "thinc==7.0.0.dev4", + "thinc==7.0.0.dev6", "blis>=0.2.2,<0.3.0", "plac<1.0.0,>=0.9.6", "regex==2018.01.10", - "dill>=0.2,<0.3", "requests>=2.13.0,<3.0.0", "jsonschema>=2.6.0,<3.0.0", "wasabi>=0.0.8,<1.1.0", From cabaadd793b785d4e3572f3851b5263345340394 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Dec 2018 15:12:39 +0100 Subject: [PATCH 19/27] Fix build error from bad import Thinc v7.0.0.dev6 moved FeatureExtracter around and didn't add a compatibility import. 
--- spacy/_ml.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index b6bc1792b..3df9d72ba 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -8,8 +8,9 @@ from thinc.t2t import ExtractWindow, ParametricAttention from thinc.t2v import Pooling, sum_pool from thinc.misc import Residual from thinc.misc import LayerNorm as LN +from thinc.misc import FeatureExtracter from thinc.api import add, layerize, chain, clone, concatenate, with_flatten -from thinc.api import FeatureExtracter, with_getitem, flatten_add_lengths +from thinc.api import with_getitem, flatten_add_lengths from thinc.api import uniqued, wrap, noop from thinc.api import with_square_sequences from thinc.linear.linear import LinearModel From 711f10853205321ee7f09419e4ae09d54059ce43 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Dec 2018 16:04:12 +0100 Subject: [PATCH 20/27] Fix cytoolz import cytoolz --- spacy/cli/profile.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index 439ef79a1..45e97b8ba 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -8,7 +8,7 @@ import cProfile import pstats import sys import tqdm -import cytoolz +import itertools import thinc.extra.datasets from wasabi import Printer @@ -40,7 +40,7 @@ def profile(model, inputs=None, n_texts=10000): with msg.loading("Loading model '{}'...".format(model)): nlp = load_model(model) msg.good("Loaded model '{}'".format(model)) - texts = list(cytoolz.take(n_texts, inputs)) + texts = list(itertools.islice(inputs, n_texts)) cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof") s = pstats.Stats("Profile.prof") msg.divider("Profile stats") From 9520489225bc631401d509f03aa8a47722a95931 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Dec 2018 18:46:09 +0100 Subject: [PATCH 21/27] Fix removal of dill (for srsly) --- spacy/tokens/doc.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index cd2428d79..42ecb5644 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -9,7 +9,7 @@ cimport numpy as np import numpy import numpy.linalg import struct -import dill +import srsly from thinc.neural.util import get_array_module, copy_array import srsly @@ -1061,11 +1061,11 @@ def pickle_doc(doc): bytes_data = doc.to_bytes(vocab=False, user_data=False) hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks, doc.user_token_hooks) - return (unpickle_doc, (doc.vocab, dill.dumps(hooks_and_data), bytes_data)) + return (unpickle_doc, (doc.vocab, srsly.pickle_dumps(hooks_and_data), bytes_data)) def unpickle_doc(vocab, hooks_and_data, bytes_data): - user_data, doc_hooks, span_hooks, token_hooks = dill.loads(hooks_and_data) + user_data, doc_hooks, span_hooks, token_hooks = srsly.pickle_loads(hooks_and_data) doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data, exclude='user_data') From c0af627f32533b8b500ccc24dcbaa204bfc83356 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Dec 2018 18:53:16 +0100 Subject: [PATCH 22/27] Fix dill usage in vocab --- spacy/vocab.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 00467ecf1..42fd2f46e 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -3,7 +3,7 @@ from __future__ import unicode_literals import numpy -import dill +import srsly from collections import OrderedDict from thinc.neural.util import get_array_module
@@ -513,7 +513,7 @@ def pickle_vocab(vocab): morph = vocab.morphology length = vocab.length data_dir = vocab.data_dir - lex_attr_getters = dill.dumps(vocab.lex_attr_getters) + lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters) lexemes_data = vocab.lexemes_to_bytes() return (unpickle_vocab, (sstore, vectors, morph, data_dir, lex_attr_getters, lexemes_data, length)) @@ -527,7 +527,7 @@ def unpickle_vocab(sstore, vectors, morphology, data_dir, vocab.strings = sstore vocab.morphology = morphology vocab.data_dir = data_dir - vocab.lex_attr_getters = dill.loads(lex_attr_getters) + vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters) vocab.lexemes_from_bytes(lexemes_data) vocab.length = length return vocab From 0a6072621570464522cbfa6d939dffccc0fa6503 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Dec 2018 20:37:00 +0100 Subject: [PATCH 23/27] Remove cytoolz usage in CLI --- spacy/cli/converters/iob2json.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py index 35dc29bb3..7864da66b 100644 --- a/spacy/cli/converters/iob2json.py +++ b/spacy/cli/converters/iob2json.py @@ -1,9 +1,8 @@ # coding: utf8 from __future__ import unicode_literals -import cytoolz - from ...gold import iob_to_biluo +from ...util import minibatch def iob2json(input_data, n_sents=10, *args, **kwargs): @@ -11,7 +10,7 @@ def iob2json(input_data, n_sents=10, *args, **kwargs): Convert IOB files into JSON format for use with train cli. """ docs = [] - for group in cytoolz.partition_all(n_sents, docs): + for group in minibatch(docs, n_sents): group = list(group) first = group.pop(0) to_extend = first["paragraphs"][0]["sentences"] From e619f452877c54c2801415abaf0f93ff3889adb4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Dec 2018 20:43:47 +0100 Subject: [PATCH 24/27] Fix pickle tests --- spacy/tests/test_pickles.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/tests/test_pickles.py b/spacy/tests/test_pickles.py index 0ae829753..ad2c33e95 100644 --- a/spacy/tests/test_pickles.py +++ b/spacy/tests/test_pickles.py @@ -2,8 +2,8 @@ from __future__ import unicode_literals import pytest -import dill as pickle import numpy +import srsly from spacy.strings import StringStore from spacy.vocab import Vocab from spacy.attrs import NORM @@ -14,8 +14,8 @@ def test_pickle_string_store(text1, text2): stringstore = StringStore() store1 = stringstore[text1] store2 = stringstore[text2] - data = pickle.dumps(stringstore, protocol=-1) - unpickled = pickle.loads(data) + data = srsly.pickle_dumps(stringstore, protocol=-1) + unpickled = srsly.pickle_loads(data) assert unpickled[text1] == store1 assert unpickled[text2] == store2 assert len(stringstore) == len(unpickled) From bb3304a4f1785cca954858b500afeef9051d245e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Dec 2018 20:46:36 +0100 Subject: [PATCH 25/27] Fix pickle tests --- spacy/tests/test_pickles.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tests/test_pickles.py b/spacy/tests/test_pickles.py index ad2c33e95..65288527a 100644 --- a/spacy/tests/test_pickles.py +++ b/spacy/tests/test_pickles.py @@ -29,8 +29,8 @@ def test_pickle_vocab(text1, text2): lex2 = vocab[text2] assert lex1.norm_ == text1[:-1] assert lex2.norm_ == text2[:-1] - data = pickle.dumps(vocab) - unpickled = pickle.loads(data) + data = srsly.pickle_dumps(vocab) + unpickled = srsly.pickle_loads(data) assert unpickled[text1].orth 
== lex1.orth assert unpickled[text2].orth == lex2.orth assert unpickled[text1].norm == lex1.norm From d896fbca62716b02ae1980ad9d0e69283dfa929c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Dec 2018 21:45:56 +0100 Subject: [PATCH 26/27] Fix batch size in parser.pipe --- spacy/syntax/nn_parser.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 55e667752..a8809b4e6 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -216,7 +216,7 @@ cdef class Parser: for batch in util.minibatch(docs, size=batch_size): batch_in_order = list(batch) by_length = sorted(batch_in_order, key=lambda doc: len(doc)) - for subbatch in util.minibatch(by_length, size=8): + for subbatch in util.minibatch(by_length, size=batch_size//4): subbatch = list(subbatch) parse_states = self.predict(subbatch, beam_width=beam_width, beam_density=beam_density) From 427c0693c8cd4ee48731f5df25b7302adc09f8b4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Dec 2018 22:48:31 +0100 Subject: [PATCH 27/27] Fix missing comma in init-model command --- spacy/cli/init_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 8dc2a8cf2..0a8570a7b 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -34,7 +34,7 @@ msg = Printer() freqs_loc=("Location of words frequencies file", "option", "f", Path), jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path), clusters_loc=("Optional location of brown clusters data", "option", "c", str), - vectors_loc=("Optional vectors file in Word2Vec format" "option", "v", str), + vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str), prune_vectors=("Optional number of vectors to prune to", "option", "V", int), ) def init_model(