💫 New JSON helpers, training data internals & CLI rewrite (#2932)

* Support nowrap setting in util.prints

* Tidy up and fix whitespace

* Simplify script and use read_jsonl helper

* Add JSON schemas (see #2928)

* Deprecate Doc.print_tree

Will be replaced with Doc.to_json, which will produce a unified format

* Add Doc.to_json() method (see #2928)

Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space.
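A minimal usage sketch of the new method, assuming the keyword for selecting custom attributes is called `underscore` (the attribute name `my_attr` is purely illustrative, not part of this commit):

```python
import spacy
from spacy.tokens import Doc

# Hypothetical custom attribute, to show serialization of the doc._. space
Doc.set_extension("my_attr", default=None)

nlp = spacy.blank("en")
doc = nlp("Hello world")
doc._.my_attr = "some value"

# Convert the Doc to the unified JSON training format; the assumed
# `underscore` argument selects which custom attributes to include
data = doc.to_json(underscore=["my_attr"])
print(data["text"], len(data["tokens"]))
```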

* Remove outdated test

* Add write_json and write_jsonl helpers
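A rough sketch of what such helpers do, with signatures inferred from how they are called in the convert command further down; the real implementations live in `spacy.util` and may differ in detail:

```python
import json
from pathlib import Path

def write_json(output_path, data):
    # Serialize a JSON-compatible object to a UTF-8 file
    with Path(output_path).open("w", encoding="utf-8") as f:
        f.write(json.dumps(data, indent=2, ensure_ascii=False))

def write_jsonl(output_path, data):
    # Write one JSON object per line (newline-delimited JSON)
    with Path(output_path).open("w", encoding="utf-8") as f:
        f.write("\n".join(json.dumps(line, ensure_ascii=False) for line in data))
```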

* WIP: Update spacy train

* Tidy up spacy train

* WIP: Use wasabi for formatting

* Add GoldParse helpers for JSON format

* WIP: add debug-data command

* Fix typo

* Add missing import

* Update wasabi pin

* Add missing import

* 💫 Refactor CLI (#2943)

To be merged into #2932.

## Description
- [x] refactor CLI to use [`wasabi`](https://github.com/ines/wasabi)
- [x] use [`black`](https://github.com/ambv/black) for auto-formatting
- [x] add `flake8` config
- [x] move all messy UD-related scripts to `cli.ud`
- [x] make converters functions that take the opened file and return the converted data, instead of having them handle the IO (see the sketch after this list)
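A minimal sketch of that converter contract (the function name is hypothetical; the keyword arguments mirror the ones the converters in this diff accept):

```python
def my_converter(input_data, n_sents=10, use_morphology=False, lang=None, **kwargs):
    """Take the already-read file contents and return the converted docs.
    Reading the input file and writing the JSON/JSONL output is handled by
    the convert command, so the converter itself never touches the filesystem."""
    docs = []
    for i, block in enumerate(input_data.strip().split("\n\n")):
        # Format-specific parsing goes here; each doc is a dict in the
        # unified training format ({"id": ..., "paragraphs": [...]})
        docs.append({"id": i, "paragraphs": []})
    return docs
```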

### Types of change
enhancement

## Checklist
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Update wasabi pin

* Delete old test

* Update errors

* Fix typo

* Tidy up and format remaining code

* Fix formatting

* Improve formatting of messages

* Auto-format remaining code

* Add tok2vec stuff to spacy.train

* Fix typo

* Update wasabi pin

* Fix path checks for when train() is called as a function

* Reformat and tidy up pretrain script

* Update argument annotations

* Raise error if model language doesn't match lang

* Document new train command
Ines Montani, 2018-11-30 20:16:14 +01:00 (committed by Matthew Honnibal)
commit 37c7c85a86, parent 0369db75c1
46 changed files with 2476 additions and 1539 deletions

View File

@ -11,6 +11,8 @@ ujson>=1.35
dill>=0.2,<0.3
regex==2018.01.10
requests>=2.13.0,<3.0.0
jsonschema>=2.6.0,<3.0.0
wasabi>=0.0.8,<1.1.0
pathlib==1.0.1; python_version < "3.4"
# Development dependencies
pytest>=4.0.0,<5.0.0

View File

@ -207,6 +207,8 @@ def setup_package():
"regex==2018.01.10",
"dill>=0.2,<0.3",
"requests>=2.13.0,<3.0.0",
"jsonschema>=2.6.0,<3.0.0",
"wasabi>=0.0.8,<1.1.0",
'pathlib==1.0.1; python_version < "3.4"',
],
setup_requires=["wheel"],

View File

@ -1,40 +1,41 @@
# coding: utf8
from __future__ import print_function
# NB! This breaks in plac on Python 2!!
# from __future__ import unicode_literals
if __name__ == '__main__':
if __name__ == "__main__":
import plac
import sys
from wasabi import Printer
from spacy.cli import download, link, info, package, train, pretrain, convert
from spacy.cli import vocab, init_model, profile, evaluate, validate
from spacy.cli import ud_train, ud_evaluate
from spacy.util import prints
from spacy.cli import init_model, profile, evaluate, validate
from spacy.cli import ud_train, ud_evaluate, debug_data
msg = Printer()
commands = {
'download': download,
'link': link,
'info': info,
'train': train,
'pretrain': pretrain,
'ud-train': ud_train,
'evaluate': evaluate,
'ud-evaluate': ud_evaluate,
'convert': convert,
'package': package,
'vocab': vocab,
'init-model': init_model,
'profile': profile,
'validate': validate
"download": download,
"link": link,
"info": info,
"train": train,
"pretrain": pretrain,
"debug-data": debug_data,
"ud-train": ud_train,
"evaluate": evaluate,
"ud-evaluate": ud_evaluate,
"convert": convert,
"package": package,
"init-model": init_model,
"profile": profile,
"validate": validate,
}
if len(sys.argv) == 1:
prints(', '.join(commands), title="Available commands", exits=1)
msg.info("Available commands", ", ".join(commands), exits=1)
command = sys.argv.pop(1)
sys.argv[0] = 'spacy %s' % command
sys.argv[0] = "spacy %s" % command
if command in commands:
plac.call(commands[command], sys.argv[1:])
else:
prints(
"Available: %s" % ', '.join(commands),
title="Unknown command: %s" % command,
exits=1)
available = "Available: {}".format(", ".join(commands))
msg.fail("Unknown command: {}".format(command), available, exits=1)

View File

@ -1,14 +1,13 @@
from .download import download
from .info import info
from .link import link
from .package import package
from .profile import profile
from .train import train
from .pretrain import pretrain
from .evaluate import evaluate
from .convert import convert
from .vocab import make_vocab as vocab
from .init_model import init_model
from .validate import validate
from .ud_train import main as ud_train
from .conll17_ud_eval import main as ud_evaluate
from .download import download # noqa: F401
from .info import info # noqa: F401
from .link import link # noqa: F401
from .package import package # noqa: F401
from .profile import profile # noqa: F401
from .train import train # noqa: F401
from .pretrain import pretrain # noqa: F401
from .debug_data import debug_data # noqa: F401
from .evaluate import evaluate # noqa: F401
from .convert import convert # noqa: F401
from .init_model import init_model # noqa: F401
from .validate import validate # noqa: F401
from .ud import ud_train, ud_evaluate # noqa: F401

View File

@ -2,6 +2,8 @@
from __future__ import unicode_literals
# fmt: off
class Messages(object):
M001 = ("Download successful but linking failed")
M002 = ("Creating a shortcut link for 'en' didn't work (maybe you "
@ -73,3 +75,31 @@ class Messages(object):
M052 = ("Not a valid meta.json format")
M053 = ("Expected dict but got: {meta_type}")
M054 = ("No --lang specified, but tokenization required.")
M055 = ("Training pipeline: {pipeline}")
M056 = ("Starting with base model '{model}'")
M057 = ("Starting with blank model '{model}'")
M058 = ("Loading vector from model '{model}'")
M059 = ("Can't use multitask objective without '{pipe}' in the pipeline")
M060 = ("Counting training words (limit={limit})")
M061 = ("\nSaving model...")
M062 = ("Output directory is not empty.")
M063 = ("Incompatible arguments")
M064 = ("The -f and -c arguments are deprecated, and not compatible with "
"the -j argument, which should specify the same information. "
"Either merge the frequencies and clusters data into the "
"JSONL-formatted file (recommended), or use only the -f and -c "
"files, without the other lexical attributes.")
M065 = ("This can lead to unintended side effects when saving the model. "
"Please use an empty directory or a different path instead. If "
"the specified output path doesn't exist, the directory will be "
"created for you.")
M066 = ("Saved model to output directory")
M067 = ("Can't find lexical data")
M068 = ("Successfully compiled vocab and vectors, and saved model")
M069 = ("Unknown file type: '{name}'")
M070 = ("Supported file types: '{options}'")
M071 = ("Loaded pretrained tok2vec for: {components}")
M072 = ("Model language ('{model_lang}') doesn't match language specified "
"as `lang` argument ('{lang}') ")
# fmt: on

View File

@ -3,49 +3,91 @@ from __future__ import unicode_literals
import plac
from pathlib import Path
from wasabi import Printer
from ..util import write_jsonl, write_json
from ..compat import json_dumps, path2str
from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json
from .converters import ner_jsonl2json
from ._messages import Messages
from ..util import prints
# Converters are matched by file extension. To add a converter, add a new
# entry to this dict with the file extension mapped to the converter function
# imported from /converters.
CONVERTERS = {
'conllubio': conllubio2json,
'conllu': conllu2json,
'conll': conllu2json,
'ner': conll_ner2json,
'iob': iob2json,
'jsonl': ner_jsonl2json
"conllubio": conllubio2json,
"conllu": conllu2json,
"conll": conllu2json,
"ner": conll_ner2json,
"iob": iob2json,
"jsonl": ner_jsonl2json,
}
# File types
FILE_TYPES = ("json", "jsonl")
@plac.annotations(
input_file=("input file", "positional", None, str),
output_dir=("output directory for converted file", "positional", None, str),
input_file=("Input file", "positional", None, str),
output_dir=("Output directory for converted file", "positional", None, str),
file_type=("Type of data to produce: 'jsonl' or 'json'", "option", "t", str),
n_sents=("Number of sentences per doc", "option", "n", int),
converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
lang=("Language (if tokenizer required)", "option", "l", str),
morphology=("Enable appending morphology to tags", "flag", "m", bool))
def convert(input_file, output_dir, n_sents=1, morphology=False, converter='auto',
lang=None):
morphology=("Enable appending morphology to tags", "flag", "m", bool),
)
def convert(
input_file,
output_dir="-",
file_type="jsonl",
n_sents=1,
morphology=False,
converter="auto",
lang=None,
):
"""
Convert files into JSON format for use with train command and other
experiment management functions.
experiment management functions. If no output_dir is specified, the data
is written to stdout, so you can pipe it forward to a JSONL file:
$ spacy convert some_file.conllu > some_file.jsonl
"""
msg = Printer()
input_path = Path(input_file)
output_path = Path(output_dir)
if file_type not in FILE_TYPES:
msg.fail(
Messages.M069.format(name=file_type),
Messages.M070.format(options=", ".join(FILE_TYPES)),
exits=1,
)
if not input_path.exists():
prints(input_path, title=Messages.M028, exits=1)
if not output_path.exists():
prints(output_path, title=Messages.M029, exits=1)
if converter == 'auto':
msg.fail(Messages.M028, input_path, exits=1)
if output_dir != "-" and not Path(output_dir).exists():
msg.fail(Messages.M029, output_dir, exits=1)
if converter == "auto":
converter = input_path.suffix[1:]
if converter not in CONVERTERS:
prints(Messages.M031.format(converter=converter),
title=Messages.M030, exits=1)
msg.fail(Messages.M030, Messages.M031.format(converter=converter), exits=1)
# Use converter function to convert data
func = CONVERTERS[converter]
func(input_path, output_path,
n_sents=n_sents, use_morphology=morphology, lang=lang)
input_data = input_path.open("r", encoding="utf-8").read()
data = func(input_data, n_sents=n_sents, use_morphology=morphology, lang=lang)
if output_dir != "-":
# Export data to a file
suffix = ".{}".format(file_type)
output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
if file_type == "json":
write_json(output_file, data)
elif file_type == "jsonl":
write_jsonl(output_file, data)
msg.good(
Messages.M032.format(name=path2str(output_file)),
Messages.M033.format(n_docs=len(data)),
)
else:
# Print to stdout
if file_type == "json":
print(json_dumps(data))
elif file_type == "jsonl":
for line in data:
print(json_dumps(line))

View File

@ -1,5 +1,5 @@
from .conllu2json import conllu2json
from .conllubio2json import conllubio2json
from .iob2json import iob2json
from .conll_ner2json import conll_ner2json
from .jsonl2json import ner_jsonl2json
from .conllu2json import conllu2json # noqa: F401
from .conllubio2json import conllubio2json # noqa: F401
from .iob2json import iob2json # noqa: F401
from .conll_ner2json import conll_ner2json # noqa: F401
from .jsonl2json import ner_jsonl2json # noqa: F401

View File

@ -1,52 +1,38 @@
# coding: utf8
from __future__ import unicode_literals
from .._messages import Messages
from ...compat import json_dumps, path2str
from ...util import prints
from ...gold import iob_to_biluo
def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
def conll_ner2json(input_data, **kwargs):
"""
Convert files in the CoNLL-2003 NER format into JSON format for use with
train cli.
"""
docs = read_conll_ner(input_path)
output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
output_file = output_path / output_filename
with output_file.open('w', encoding='utf-8') as f:
f.write(json_dumps(docs))
prints(Messages.M033.format(n_docs=len(docs)),
title=Messages.M032.format(name=path2str(output_file)))
def read_conll_ner(input_path):
text = input_path.open('r', encoding='utf-8').read()
i = 0
delimit_docs = '-DOCSTART- -X- O O'
delimit_docs = "-DOCSTART- -X- O O"
output_docs = []
for doc in text.strip().split(delimit_docs):
for doc in input_data.strip().split(delimit_docs):
doc = doc.strip()
if not doc:
continue
output_doc = []
for sent in doc.split('\n\n'):
for sent in doc.split("\n\n"):
sent = sent.strip()
if not sent:
continue
lines = [line.strip() for line in sent.split('\n') if line.strip()]
lines = [line.strip() for line in sent.split("\n") if line.strip()]
words, tags, chunks, iob_ents = zip(*[line.split() for line in lines])
biluo_ents = iob_to_biluo(iob_ents)
output_doc.append({'tokens': [
{'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in
zip(words, tags, biluo_ents)
]})
output_docs.append({
'id': len(output_docs),
'paragraphs': [{'sentences': output_doc}]
})
output_doc.append(
{
"tokens": [
{"orth": w, "tag": tag, "ner": ent}
for (w, tag, ent) in zip(words, tags, biluo_ents)
]
}
)
output_docs.append(
{"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]}
)
output_doc = []
return output_docs

View File

@ -1,34 +1,27 @@
# coding: utf8
from __future__ import unicode_literals
from .._messages import Messages
from ...compat import json_dumps, path2str
from ...util import prints
from ...gold import iob_to_biluo
import re
from ...gold import iob_to_biluo
def conllu2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None):
"""
Convert conllu files into JSON format for use with train cli.
use_morphology parameter enables appending morphology to tags, which is
useful for languages such as Spanish, where UD tags are not so rich.
"""
# by @dvsrepo, via #11 explosion/spacy-dev-resources
"""
Extract NER tags if available and convert them so that they follow
BILUO and the Wikipedia scheme
"""
# by @dvsrepo, via #11 explosion/spacy-dev-resources
# by @katarkor
docs = []
sentences = []
conll_tuples = read_conllx(input_path, use_morphology=use_morphology)
conll_tuples = read_conllx(input_data, use_morphology=use_morphology)
checked_for_ner = False
has_ner_tags = False
for i, (raw_text, tokens) in enumerate(conll_tuples):
sentence, brackets = tokens[0]
if not checked_for_ner:
@ -37,29 +30,19 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False, lang=
sentences.append(generate_sentence(sentence, has_ner_tags))
# Real-sized documents could be extracted using the comments on the
# conllu document
if(len(sentences) % n_sents == 0):
if len(sentences) % n_sents == 0:
doc = create_doc(sentences, i)
docs.append(doc)
sentences = []
output_filename = input_path.parts[-1].replace(".conll", ".json")
output_filename = input_path.parts[-1].replace(".conllu", ".json")
output_file = output_path / output_filename
with output_file.open('w', encoding='utf-8') as f:
f.write(json_dumps(docs))
prints(Messages.M033.format(n_docs=len(docs)),
title=Messages.M032.format(name=path2str(output_file)))
return docs
def is_ner(tag):
"""
Check the 10th column of the first token to determine if the file contains
NER tags
"""
tag_match = re.match('([A-Z_]+)-([A-Z_]+)', tag)
Check the 10th column of the first token to determine if the file contains
NER tags
"""
tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
if tag_match:
return True
elif tag == "O":
@ -67,29 +50,29 @@ def is_ner(tag):
else:
return False
def read_conllx(input_path, use_morphology=False, n=0):
text = input_path.open('r', encoding='utf-8').read()
def read_conllx(input_data, use_morphology=False, n=0):
i = 0
for sent in text.strip().split('\n\n'):
lines = sent.strip().split('\n')
for sent in input_data.strip().split("\n\n"):
lines = sent.strip().split("\n")
if lines:
while lines[0].startswith('#'):
while lines[0].startswith("#"):
lines.pop(0)
tokens = []
for line in lines:
parts = line.split('\t')
parts = line.split("\t")
id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts
if '-' in id_ or '.' in id_:
if "-" in id_ or "." in id_:
continue
try:
id_ = int(id_) - 1
head = (int(head) - 1) if head != '0' else id_
dep = 'ROOT' if dep == 'root' else dep
tag = pos if tag == '_' else tag
tag = tag+'__'+morph if use_morphology else tag
head = (int(head) - 1) if head != "0" else id_
dep = "ROOT" if dep == "root" else dep
tag = pos if tag == "_" else tag
tag = tag + "__" + morph if use_morphology else tag
tokens.append((id_, word, tag, head, dep, iob))
except:
except: # noqa: E722
print(line)
raise
tuples = [list(t) for t in zip(*tokens)]
@ -98,31 +81,31 @@ def read_conllx(input_path, use_morphology=False, n=0):
if n >= 1 and i >= n:
break
def simplify_tags(iob):
"""
Simplify tags obtained from the dataset in order to follow Wikipedia
scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to
'MISC'.
'MISC'.
"""
new_iob = []
for tag in iob:
tag_match = re.match('([A-Z_]+)-([A-Z_]+)', tag)
tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
if tag_match:
prefix = tag_match.group(1)
suffix = tag_match.group(2)
if suffix == 'GPE_LOC':
suffix = 'LOC'
elif suffix == 'GPE_ORG':
suffix = 'ORG'
elif suffix != 'PER' and suffix != 'LOC' and suffix != 'ORG':
suffix = 'MISC'
tag = prefix + '-' + suffix
if suffix == "GPE_LOC":
suffix = "LOC"
elif suffix == "GPE_ORG":
suffix = "ORG"
elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
suffix = "MISC"
tag = prefix + "-" + suffix
new_iob.append(tag)
return new_iob
def generate_sentence(sent, has_ner_tags):
(id_, word, tag, head, dep, iob) = sent
sentence = {}
@ -144,7 +127,7 @@ def generate_sentence(sent, has_ner_tags):
return sentence
def create_doc(sentences,id):
def create_doc(sentences, id):
doc = {}
paragraph = {}
doc["id"] = id

View File

@ -1,65 +1,54 @@
# coding: utf8
from __future__ import unicode_literals
from ...compat import json_dumps, path2str
from ...util import prints
from ...gold import iob_to_biluo
def conllubio2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
def conllubio2json(input_data, n_sents=10, use_morphology=False, lang=None):
"""
Convert conllu files into JSON format for use with train cli.
use_morphology parameter enables appending morphology to tags, which is
useful for languages such as Spanish, where UD tags are not so rich.
"""
# by @dvsrepo, via #11 explosion/spacy-dev-resources
docs = []
sentences = []
conll_tuples = read_conllx(input_path, use_morphology=use_morphology)
conll_tuples = read_conllx(input_data, use_morphology=use_morphology)
for i, (raw_text, tokens) in enumerate(conll_tuples):
sentence, brackets = tokens[0]
sentences.append(generate_sentence(sentence))
# Real-sized documents could be extracted using the comments on the
# conllu document
if(len(sentences) % n_sents == 0):
if len(sentences) % n_sents == 0:
doc = create_doc(sentences, i)
docs.append(doc)
sentences = []
output_filename = input_path.parts[-1].replace(".conll", ".json")
output_filename = input_path.parts[-1].replace(".conllu", ".json")
output_file = output_path / output_filename
with output_file.open('w', encoding='utf-8') as f:
f.write(json_dumps(docs))
prints("Created %d documents" % len(docs),
title="Generated output file %s" % path2str(output_file))
return docs
def read_conllx(input_path, use_morphology=False, n=0):
text = input_path.open('r', encoding='utf-8').read()
def read_conllx(input_data, use_morphology=False, n=0):
i = 0
for sent in text.strip().split('\n\n'):
lines = sent.strip().split('\n')
for sent in input_data.strip().split("\n\n"):
lines = sent.strip().split("\n")
if lines:
while lines[0].startswith('#'):
while lines[0].startswith("#"):
lines.pop(0)
tokens = []
for line in lines:
parts = line.split('\t')
parts = line.split("\t")
id_, word, lemma, pos, tag, morph, head, dep, _1, ner = parts
if '-' in id_ or '.' in id_:
if "-" in id_ or "." in id_:
continue
try:
id_ = int(id_) - 1
head = (int(head) - 1) if head != '0' else id_
dep = 'ROOT' if dep == 'root' else dep
tag = pos if tag == '_' else tag
tag = tag+'__'+morph if use_morphology else tag
ner = ner if ner else 'O'
head = (int(head) - 1) if head != "0" else id_
dep = "ROOT" if dep == "root" else dep
tag = pos if tag == "_" else tag
tag = tag + "__" + morph if use_morphology else tag
ner = ner if ner else "O"
tokens.append((id_, word, tag, head, dep, ner))
except:
except: # noqa: E722
print(line)
raise
tuples = [list(t) for t in zip(*tokens)]
@ -68,6 +57,7 @@ def read_conllx(input_path, use_morphology=False, n=0):
if n >= 1 and i >= n:
break
def generate_sentence(sent):
(id_, word, tag, head, dep, ner) = sent
sentence = {}
@ -85,7 +75,7 @@ def generate_sentence(sent):
return sentence
def create_doc(sentences,id):
def create_doc(sentences, id):
doc = {}
paragraph = {}
doc["id"] = id

View File

@ -1,26 +1,24 @@
# coding: utf8
from __future__ import unicode_literals
from cytoolz import partition_all, concat
from .._messages import Messages
from ...compat import json_dumps, path2str
from ...util import prints
from cytoolz import partition_all
from ...gold import iob_to_biluo
def iob2json(input_path, output_path, n_sents=10, *a, **k):
def iob2json(input_data, n_sents=10, *args, **kwargs):
"""
Convert IOB files into JSON format for use with train cli.
"""
with input_path.open('r', encoding='utf8') as file_:
sentences = read_iob(file_)
docs = merge_sentences(sentences, n_sents)
output_filename = input_path.parts[-1].replace(".iob", ".json")
output_file = output_path / output_filename
with output_file.open('w', encoding='utf-8') as f:
f.write(json_dumps(docs))
prints(Messages.M033.format(n_docs=len(docs)),
title=Messages.M032.format(name=path2str(output_file)))
docs = read_iob(input_data.split("\n"))
merged = []
for group in partition_all(n_sents, docs):
group = list(group)
first = group.pop(0)
to_extend = first["paragraphs"][0]["sentences"]
for sent in group:
to_extend.extend(sent["paragraphs"][0]["sentences"])
merged.append(first)
return merged
def read_iob(raw_sents):
@ -28,30 +26,20 @@ def read_iob(raw_sents):
for line in raw_sents:
if not line.strip():
continue
tokens = [t.split('|') for t in line.split()]
tokens = [t.split("|") for t in line.split()]
if len(tokens[0]) == 3:
words, pos, iob = zip(*tokens)
else:
words, iob = zip(*tokens)
pos = ['-'] * len(words)
pos = ["-"] * len(words)
biluo = iob_to_biluo(iob)
sentences.append([
{'orth': w, 'tag': p, 'ner': ent}
for (w, p, ent) in zip(words, pos, biluo)
])
sentences = [{'tokens': sent} for sent in sentences]
paragraphs = [{'sentences': [sent]} for sent in sentences]
docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs]
sentences.append(
[
{"orth": w, "tag": p, "ner": ent}
for (w, p, ent) in zip(words, pos, biluo)
]
)
sentences = [{"tokens": sent} for sent in sentences]
paragraphs = [{"sentences": [sent]} for sent in sentences]
docs = [{"id": 0, "paragraphs": [para]} for para in paragraphs]
return docs
def merge_sentences(docs, n_sents):
counter = 0
merged = []
for group in partition_all(n_sents, docs):
group = list(group)
first = group.pop(0)
to_extend = first['paragraphs'][0]['sentences']
for sent in group[1:]:
to_extend.extend(sent['paragraphs'][0]['sentences'])
merged.append(first)
return merged

View File

@ -1,33 +1,21 @@
# coding: utf8
from __future__ import unicode_literals
import ujson as json
import ujson
from ...util import get_lang_class
from .._messages import Messages
from ...compat import json_dumps, path2str
from ...util import prints, get_lang_class
from ...gold import docs_to_json
def ner_jsonl2json(input_path, output_path, lang=None, n_sents=10, use_morphology=False):
def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
if lang is None:
prints(Messages.M054, exits=True)
raise ValueError(Messages.M054)
json_docs = []
input_tuples = list(read_jsonl(input_path))
input_tuples = [ujson.loads(line) for line in input_data.split("\n") if line.strip()]
nlp = get_lang_class(lang)()
for i, (raw_text, ents) in enumerate(input_tuples):
doc = nlp.make_doc(raw_text)
doc[0].is_sent_start = True
doc.ents = [doc.char_span(s, e, label=L) for s, e, L in ents['entities']]
json_docs.append(docs_to_json(i, [doc]))
output_filename = input_path.parts[-1].replace(".jsonl", ".json")
output_loc = output_path / output_filename
with (output_loc).open('w', encoding='utf8') as file_:
file_.write(json_dumps(json_docs))
prints(Messages.M033.format(n_docs=len(json_docs)),
title=Messages.M032.format(name=path2str(output_loc)))
def read_jsonl(input_path):
with input_path.open('r', encoding='utf8') as file_:
for line in file_:
yield json.loads(line)
doc.ents = [doc.char_span(s, e, label=L) for s, e, L in ents["entities"]]
json_docs.append(doc.to_json())
return json_docs

spacy/cli/debug_data.py (new file, 398 lines)
View File

@ -0,0 +1,398 @@
# coding: utf8
from __future__ import unicode_literals, print_function
from pathlib import Path
from collections import Counter
import plac
import sys
from wasabi import Printer, MESSAGES
from ..gold import GoldCorpus, read_json_object
from ..util import load_model, get_lang_class, read_json, read_jsonl
# from .schemas import get_schema, validate_json
from ._messages import Messages
# Minimum number of expected occurrences of label in data to train new label
NEW_LABEL_THRESHOLD = 50
# Minimum number of expected examples to train a blank model
BLANK_MODEL_MIN_THRESHOLD = 100
BLANK_MODEL_THRESHOLD = 2000
@plac.annotations(
lang=("model language", "positional", None, str),
train_path=("location of JSON-formatted training data", "positional", None, Path),
dev_path=("location of JSON-formatted development data", "positional", None, Path),
base_model=("name of model to update (optional)", "option", "b", str),
pipeline=(
"Comma-separated names of pipeline components to train",
"option",
"p",
str,
),
ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
ignore_validation=(
"Don't exit if JSON format validation fails",
"flag",
"IV",
bool,
),
verbose=("Print additional information and explanations", "flag", "V", bool),
no_format=("Don't pretty-print the results", "flag", "NF", bool),
)
def debug_data(
lang,
train_path,
dev_path,
base_model=None,
pipeline="tagger,parser,ner",
ignore_warnings=False,
ignore_validation=False,
verbose=False,
no_format=False,
):
msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)
# Make sure all files and paths exists if they are needed
if not train_path.exists():
msg.fail(Messages.M050, train_path, exits=1)
if not dev_path.exists():
msg.fail(Messages.M051, dev_path, exits=1)
# Initialize the model and pipeline
pipeline = [p.strip() for p in pipeline.split(",")]
if base_model:
nlp = load_model(base_model)
else:
lang_cls = get_lang_class(lang)
nlp = lang_cls()
msg.divider("Data format validation")
# Loading the data in one go might take a while, but it's okay in this case
with msg.loading("Loading {}...".format(train_path.parts[-1])):
train_data = _load_file(train_path, msg)
with msg.loading("Loading {}...".format(dev_path.parts[-1])):
dev_data = _load_file(dev_path, msg)
# Validate data format using the JSON schema
# TODO: update once the new format is ready
# schema = get_schema("training")
train_data_errors = [] # TODO: validate_json(train_data, schema)
dev_data_errors = [] # TODO: validate_json(dev_data, schema)
if not train_data_errors:
msg.good("Training data JSON format is valid")
if not dev_data_errors:
msg.good("Development data JSON format is valid")
for error in train_data_errors:
msg.fail("Training data: {}".format(error))
for error in dev_data_errors:
msg.fail("Development data: {}".format(error))
if (train_data_errors or dev_data_errors) and not ignore_validation:
sys.exit(1)
# Create the gold corpus to be able to better analyze data
with msg.loading("Analyzing corpus..."):
train_data = read_json_object(train_data)
dev_data = read_json_object(dev_data)
corpus = GoldCorpus(train_data, dev_data)
train_docs = list(corpus.train_docs(nlp))
dev_docs = list(corpus.dev_docs(nlp))
msg.good("Corpus is loadable")
# Create all gold data here to avoid iterating over the train_docs constantly
gold_data = _compile_gold(train_docs, pipeline)
train_texts = gold_data["texts"]
dev_texts = set([doc.text for doc, gold in dev_docs])
msg.divider("Training stats")
msg.text("Training pipeline: {}".format(", ".join(pipeline)))
for pipe in [p for p in pipeline if p not in nlp.factories]:
msg.fail("Pipeline component '{}' not available in factories".format(pipe))
if base_model:
msg.text("Starting with base model '{}'".format(base_model))
else:
msg.text("Starting with blank model '{}'".format(lang))
msg.text("{} training docs".format(len(train_docs)))
msg.text("{} evaluation docs".format(len(dev_docs)))
overlap = len(train_texts.intersection(dev_texts))
if overlap:
msg.warn("{} training examples also in evaluation data".format(overlap))
else:
msg.good("No overlap between training and evaluation data")
if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD:
text = "Low number of examples to train from a blank model ({})".format(
len(train_docs)
)
if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD:
msg.fail(text)
else:
msg.warn(text)
msg.text(
"It's recommended to use at least {} examples (minimum {})".format(
BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD
),
show=verbose,
)
msg.divider("Vocab & Vectors")
n_words = gold_data["n_words"]
msg.info(
"{} total {} in the data ({} unique)".format(
n_words, "word" if n_words == 1 else "words", len(gold_data["words"])
)
)
most_common_words = gold_data["words"].most_common(10)
msg.text(
"10 most common words: {}".format(
_format_labels(most_common_words, counts=True)
),
show=verbose,
)
if len(nlp.vocab.vectors):
msg.info(
"{} vectors ({} unique keys, {} dimensions)".format(
len(nlp.vocab.vectors),
nlp.vocab.vectors.n_keys,
nlp.vocab.vectors_length,
)
)
else:
msg.info("No word vectors present in the model")
if "ner" in pipeline:
# Get all unique NER labels present in the data
labels = set(label for label in gold_data["ner"] if label not in ("O", "-"))
label_counts = gold_data["ner"]
model_labels = _get_labels_from_model(nlp, "ner")
new_labels = [l for l in labels if l not in model_labels]
existing_labels = [l for l in labels if l in model_labels]
has_low_data_warning = False
has_no_neg_warning = False
msg.divider("Named Entity Recognition")
msg.info(
"{} new {}, {} existing {}".format(
len(new_labels),
"label" if len(new_labels) == 1 else "labels",
len(existing_labels),
"label" if len(existing_labels) == 1 else "labels",
)
)
missing_values = label_counts["-"]
msg.text(
"{} missing {} (tokens with '-' label)".format(
missing_values, "value" if missing_values == 1 else "values"
)
)
if new_labels:
labels_with_counts = [
(label, count)
for label, count in label_counts.most_common()
if label != "-"
]
labels_with_counts = _format_labels(labels_with_counts, counts=True)
msg.text("New: {}".format(labels_with_counts), show=verbose)
if existing_labels:
msg.text(
"Existing: {}".format(_format_labels(existing_labels)), show=verbose
)
for label in new_labels:
if label_counts[label] <= NEW_LABEL_THRESHOLD:
msg.warn(
"Low number of examples for new label '{}' ({})".format(
label, label_counts[label]
)
)
has_low_data_warning = True
with msg.loading("Analyzing label distribution..."):
neg_docs = _get_examples_without_label(train_docs, label)
if neg_docs == 0:
msg.warn(
"No examples for texts WITHOUT new label '{}'".format(label)
)
has_no_neg_warning = True
if not has_low_data_warning:
msg.good("Good amount of examples for all labels")
if not has_no_neg_warning:
msg.good("Examples without occurrences available for all labels")
if has_low_data_warning:
msg.text(
"To train a new entity type, your data should include at "
"least {} instances of the new label".format(NEW_LABEL_THRESHOLD),
show=verbose,
)
if has_no_neg_warning:
msg.text(
"Training data should always include examples of entities "
"in context, as well as examples without a given entity "
"type.",
show=verbose,
)
if "textcat" in pipeline:
msg.divider("Text Classification")
labels = [label for label in gold_data["cats"]]
model_labels = _get_labels_from_model(nlp, "textcat")
new_labels = [l for l in labels if l not in model_labels]
existing_labels = [l for l in labels if l in model_labels]
msg.info(
"Text Classification: {} new label(s), {} existing label(s)".format(
len(new_labels), len(existing_labels)
)
)
if new_labels:
labels_with_counts = _format_labels(
gold_data["cats"].most_common(), counts=True
)
msg.text("New: {}".format(labels_with_counts), show=verbose)
if existing_labels:
msg.text(
"Existing: {}".format(_format_labels(existing_labels)), show=verbose
)
if "tagger" in pipeline:
msg.divider("Part-of-speech Tagging")
labels = [label for label in gold_data["tags"]]
tag_map = nlp.Defaults.tag_map
msg.info(
"{} {} in data ({} {} in tag map)".format(
len(labels),
"label" if len(labels) == 1 else "labels",
len(tag_map),
"label" if len(tag_map) == 1 else "labels",
)
)
labels_with_counts = _format_labels(
gold_data["tags"].most_common(), counts=True
)
msg.text(labels_with_counts, show=verbose)
non_tagmap = [l for l in labels if l not in tag_map]
if not non_tagmap:
msg.good("All labels present in tag map for language '{}'".format(nlp.lang))
for label in non_tagmap:
msg.fail(
"Label '{}' not found in tag map for language '{}'".format(
label, nlp.lang
)
)
if "parser" in pipeline:
msg.divider("Dependency Parsing")
labels = [label for label in gold_data["deps"]]
msg.info(
"{} {} in data".format(
len(labels), "label" if len(labels) == 1 else "labels"
)
)
labels_with_counts = _format_labels(
gold_data["deps"].most_common(), counts=True
)
msg.text(labels_with_counts, show=verbose)
msg.divider("Summary")
good_counts = msg.counts[MESSAGES.GOOD]
warn_counts = msg.counts[MESSAGES.WARN]
fail_counts = msg.counts[MESSAGES.FAIL]
if good_counts:
msg.good(
"{} {} passed".format(
good_counts, "check" if good_counts == 1 else "checks"
)
)
if warn_counts:
msg.warn(
"{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings")
)
if fail_counts:
msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors"))
if fail_counts:
sys.exit(1)
def _load_file(file_path, msg):
file_name = file_path.parts[-1]
if file_path.suffix == ".json":
data = read_json(file_path)
msg.good("Loaded {}".format(file_name))
return data
elif file_path.suffix == ".jsonl":
data = read_jsonl(file_path)
msg.good("Loaded {}".format(file_name))
return data
msg.fail(
"Can't load file extension {}".format(file_path.suffix),
"Expected .json or .jsonl",
exits=1,
)
def _compile_gold(train_docs, pipeline):
data = {
"ner": Counter(),
"cats": Counter(),
"tags": Counter(),
"deps": Counter(),
"words": Counter(),
"n_words": 0,
"texts": set(),
}
for doc, gold in train_docs:
data["words"].update(gold.words)
data["n_words"] += len(gold.words)
data["texts"].add(doc.text)
if "ner" in pipeline:
for label in gold.ner:
if label.startswith(("B-", "U-")):
combined_label = label.split("-")[1]
data["ner"][combined_label] += 1
elif label == "-":
data["ner"]["-"] += 1
if "textcat" in pipeline:
data["cats"].update(gold.cats)
if "tagger" in pipeline:
data["tags"].update(gold.tags)
if "parser" in pipeline:
data["deps"].update(gold.labels)
return data
def _format_labels(labels, counts=False):
if counts:
return ", ".join(["'{}' ({})".format(l, c) for l, c in labels])
return ", ".join(["'{}'".format(l) for l in labels])
def _get_ner_counts(data):
counter = Counter()
for doc, gold in data:
for label in gold.ner:
if label.startswith(("B-", "U-")):
combined_label = label.split("-")[1]
counter[combined_label] += 1
elif label == "-":
counter["-"] += 1
return counter
def _get_examples_without_label(data, label):
count = 0
for doc, gold in data:
labels = [label.split("-")[1] for label in gold.ner if label not in ("O", "-")]
if label not in labels:
count += 1
return count
def _get_labels_from_model(nlp, pipe_name):
if pipe_name not in nlp.pipe_names:
return set()
pipe = nlp.get_pipe(pipe_name)
return pipe.labels

View File

@ -6,34 +6,37 @@ import requests
import os
import subprocess
import sys
from wasabi import Printer
from ._messages import Messages
from .link import link
from ..util import prints, get_package_path
from ..util import get_package_path
from .. import about
msg = Printer()
@plac.annotations(
model=("model to download, shortcut or name", "positional", None, str),
direct=("force direct download. Needs model name with version and won't "
"perform compatibility check", "flag", "d", bool),
pip_args=("additional arguments to be passed to `pip install` when "
"installing the model"))
model=("Model to download (shortcut or name)", "positional", None, str),
direct=("Force direct download of name + version", "flag", "d", bool),
pip_args=("additional arguments to be passed to `pip install` on model install"),
)
def download(model, direct=False, *pip_args):
"""
Download compatible model from default download path using pip. Model
can be shortcut, model name or, if --direct flag is set, full model name
with version.
with version. For direct downloads, the compatibility check will be skipped.
"""
if direct:
dl = download_model('{m}/{m}.tar.gz#egg={m}'.format(m=model), pip_args)
dl = download_model("{m}/{m}.tar.gz#egg={m}".format(m=model), pip_args)
else:
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
model_name = shortcuts.get(model, model)
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
dl = download_model('{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}'
.format(m=model_name, v=version), pip_args)
dl_tpl = "{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}"
dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
if dl != 0: # if download subprocess doesn't return 0, exit
sys.exit(dl)
try:
@ -43,44 +46,49 @@ def download(model, direct=False, *pip_args):
# subprocess
package_path = get_package_path(model_name)
link(model_name, model, force=True, model_path=package_path)
except:
except: # noqa: E722
# Dirty, but since spacy.download and the auto-linking is
# mostly a convenience wrapper, it's best to show a success
# message and loading instructions, even if linking fails.
prints(Messages.M001, title=Messages.M002.format(name=model_name))
msg.warn(Messages.M002.format(name=model_name), Messages.M001)
def get_json(url, desc):
r = requests.get(url)
if r.status_code != 200:
prints(Messages.M004.format(desc=desc, version=about.__version__),
title=Messages.M003.format(code=r.status_code), exits=1)
msg.fail(
Messages.M003.format(code=r.status_code),
Messages.M004.format(desc=desc, version=about.__version__),
exits=1,
)
return r.json()
def get_compatibility():
version = about.__version__
version = version.rsplit('.dev', 1)[0]
version = version.rsplit(".dev", 1)[0]
comp_table = get_json(about.__compatibility__, "compatibility table")
comp = comp_table['spacy']
comp = comp_table["spacy"]
if version not in comp:
prints(Messages.M006.format(version=version), title=Messages.M005,
exits=1)
msg.fail(Messages.M005, Messages.M006.format(version=version), exits=1)
return comp[version]
def get_version(model, comp):
model = model.rsplit('.dev', 1)[0]
model = model.rsplit(".dev", 1)[0]
if model not in comp:
prints(Messages.M007.format(name=model, version=about.__version__),
title=Messages.M005, exits=1)
msg.fail(
Messages.M005,
Messages.M007.format(name=model, version=about.__version__),
exits=1,
)
return comp[model][0]
def download_model(filename, user_pip_args=None):
download_url = about.__download_url__ + '/' + filename
pip_args = ['--no-cache-dir', '--no-deps']
download_url = about.__download_url__ + "/" + filename
pip_args = ["--no-cache-dir", "--no-deps"]
if user_pip_args:
pip_args.extend(user_pip_args)
cmd = [sys.executable, '-m', 'pip', 'install'] + pip_args + [download_url]
cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
return subprocess.call(cmd, env=os.environ.copy())

View File

@ -3,30 +3,35 @@ from __future__ import unicode_literals, division, print_function
import plac
from timeit import default_timer as timer
from wasabi import Printer
from ._messages import Messages
from ..gold import GoldCorpus
from ..util import prints
from .. import util
from .. import displacy
@plac.annotations(
model=("model name or path", "positional", None, str),
data_path=("location of JSON-formatted evaluation data", "positional",
None, str),
gold_preproc=("use gold preprocessing", "flag", "G", bool),
gpu_id=("use GPU", "option", "g", int),
displacy_path=("directory to output rendered parses as HTML", "option",
"dp", str),
displacy_limit=("limit of parses to render as HTML", "option", "dl", int))
def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None,
displacy_limit=25):
model=("Model name or path", "positional", None, str),
data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
gpu_id=("Use GPU", "option", "g", int),
displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int),
)
def evaluate(
model,
data_path,
gpu_id=-1,
gold_preproc=False,
displacy_path=None,
displacy_limit=25,
):
"""
Evaluate a model. To render a sample of parses in an HTML file, set an
output directory as the displacy_path argument.
"""
msg = Printer()
util.fix_random_seed()
if gpu_id >= 0:
util.use_gpu(gpu_id)
@ -34,9 +39,9 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None
data_path = util.ensure_path(data_path)
displacy_path = util.ensure_path(displacy_path)
if not data_path.exists():
prints(data_path, title=Messages.M034, exits=1)
msg.fail(Messages.M034, data_path, exits=1)
if displacy_path and not displacy_path.exists():
prints(displacy_path, title=Messages.M035, exits=1)
msg.fail(Messages.M035, displacy_path, exits=1)
corpus = GoldCorpus(data_path, data_path)
nlp = util.load_model(model)
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
@ -44,65 +49,80 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None
scorer = nlp.evaluate(dev_docs, verbose=False)
end = timer()
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
print_results(scorer, time=end - begin, words=nwords,
wps=nwords / (end - begin))
results = {
"Time": "%.2f s" % (end - begin),
"Words": nwords,
"Words/s": "%.0f" % (nwords / (end - begin)),
"TOK": "%.2f" % scorer.token_acc,
"POS": "%.2f" % scorer.tags_acc,
"UAS": "%.2f" % scorer.uas,
"LAS": "%.2f" % scorer.las,
"NER P": "%.2f" % scorer.ents_p,
"NER R": "%.2f" % scorer.ents_r,
"NER F": "%.2f" % scorer.ents_f,
}
msg.table(results, title="Results")
if displacy_path:
docs, golds = zip(*dev_docs)
render_deps = 'parser' in nlp.meta.get('pipeline', [])
render_ents = 'ner' in nlp.meta.get('pipeline', [])
render_parses(docs, displacy_path, model_name=model,
limit=displacy_limit, deps=render_deps, ents=render_ents)
prints(displacy_path, title=Messages.M036.format(n=displacy_limit))
render_deps = "parser" in nlp.meta.get("pipeline", [])
render_ents = "ner" in nlp.meta.get("pipeline", [])
render_parses(
docs,
displacy_path,
model_name=model,
limit=displacy_limit,
deps=render_deps,
ents=render_ents,
)
msg.good(Messages.M036.format(n=displacy_limit), displacy_path)
def render_parses(docs, output_path, model_name='', limit=250, deps=True,
ents=True):
docs[0].user_data['title'] = model_name
def render_parses(docs, output_path, model_name="", limit=250, deps=True, ents=True):
docs[0].user_data["title"] = model_name
if ents:
with (output_path / 'entities.html').open('w') as file_:
html = displacy.render(docs[:limit], style='ent', page=True)
with (output_path / "entities.html").open("w") as file_:
html = displacy.render(docs[:limit], style="ent", page=True)
file_.write(html)
if deps:
with (output_path / 'parses.html').open('w') as file_:
html = displacy.render(docs[:limit], style='dep', page=True,
options={'compact': True})
with (output_path / "parses.html").open("w") as file_:
html = displacy.render(
docs[:limit], style="dep", page=True, options={"compact": True}
)
file_.write(html)
def print_progress(itn, losses, dev_scores, wps=0.0):
scores = {}
for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
'ents_p', 'ents_r', 'ents_f', 'wps']:
for col in [
"dep_loss",
"tag_loss",
"uas",
"tags_acc",
"token_acc",
"ents_p",
"ents_r",
"ents_f",
"wps",
]:
scores[col] = 0.0
scores['dep_loss'] = losses.get('parser', 0.0)
scores['ner_loss'] = losses.get('ner', 0.0)
scores['tag_loss'] = losses.get('tagger', 0.0)
scores["dep_loss"] = losses.get("parser", 0.0)
scores["ner_loss"] = losses.get("ner", 0.0)
scores["tag_loss"] = losses.get("tagger", 0.0)
scores.update(dev_scores)
scores['wps'] = wps
tpl = '\t'.join((
'{:d}',
'{dep_loss:.3f}',
'{ner_loss:.3f}',
'{uas:.3f}',
'{ents_p:.3f}',
'{ents_r:.3f}',
'{ents_f:.3f}',
'{tags_acc:.3f}',
'{token_acc:.3f}',
'{wps:.1f}'))
scores["wps"] = wps
tpl = "\t".join(
(
"{:d}",
"{dep_loss:.3f}",
"{ner_loss:.3f}",
"{uas:.3f}",
"{ents_p:.3f}",
"{ents_r:.3f}",
"{ents_f:.3f}",
"{tags_acc:.3f}",
"{token_acc:.3f}",
"{wps:.1f}",
)
)
print(tpl.format(itn, **scores))
def print_results(scorer, time, words, wps):
results = {
'Time': '%.2f s' % time,
'Words': words,
'Words/s': '%.0f' % wps,
'TOK': '%.2f' % scorer.token_acc,
'POS': '%.2f' % scorer.tags_acc,
'UAS': '%.2f' % scorer.uas,
'LAS': '%.2f' % scorer.las,
'NER P': '%.2f' % scorer.ents_p,
'NER R': '%.2f' % scorer.ents_r,
'NER F': '%.2f' % scorer.ents_f}
util.print_table(results, title="Results")

View File

@ -4,6 +4,7 @@ from __future__ import unicode_literals
import plac
import platform
from pathlib import Path
from wasabi import Printer
from ._messages import Messages
from ..compat import path2str
@ -12,56 +13,65 @@ from .. import about
@plac.annotations(
model=("optional: shortcut link of model", "positional", None, str),
markdown=("generate Markdown for GitHub issues", "flag", "md", str),
silent=("don't print anything (just return)", "flag", "s"))
model=("Optional shortcut link of model", "positional", None, str),
markdown=("Generate Markdown for GitHub issues", "flag", "md", str),
silent=("Don't print anything (just return)", "flag", "s"),
)
def info(model=None, markdown=False, silent=False):
"""Print info about spaCy installation. If a model shortcut link is
"""
Print info about spaCy installation. If a model shortcut link is
specified as an argument, print model information. Flag --markdown
prints details in Markdown for easy copy-pasting to GitHub issues.
"""
msg = Printer()
if model:
if util.is_package(model):
model_path = util.get_package_path(model)
else:
model_path = util.get_data_path() / model
meta_path = model_path / 'meta.json'
meta_path = model_path / "meta.json"
if not meta_path.is_file():
util.prints(meta_path, title=Messages.M020, exits=1)
msg.fail(Messages.M020, meta_path, exits=1)
meta = util.read_json(meta_path)
if model_path.resolve() != model_path:
meta['link'] = path2str(model_path)
meta['source'] = path2str(model_path.resolve())
meta["link"] = path2str(model_path)
meta["source"] = path2str(model_path.resolve())
else:
meta['source'] = path2str(model_path)
meta["source"] = path2str(model_path)
if not silent:
print_info(meta, 'model %s' % model, markdown)
title = "Info about model '{}'".format(model)
model_meta = {
k: v for k, v in meta.items() if k not in ("accuracy", "speed")
}
if markdown:
util.print_markdown(model_meta, title=title)
else:
msg.table(model_meta, title=title)
return meta
data = {'spaCy version': about.__version__,
'Location': path2str(Path(__file__).parent.parent),
'Platform': platform.platform(),
'Python version': platform.python_version(),
'Models': list_models()}
data = {
"spaCy version": about.__version__,
"Location": path2str(Path(__file__).parent.parent),
"Platform": platform.platform(),
"Python version": platform.python_version(),
"Models": list_models(),
}
if not silent:
print_info(data, 'spaCy', markdown)
title = "Info about spaCy"
if markdown:
util.print_markdown(data, title=title)
else:
msg.table(data, title=title)
return data
def print_info(data, title, markdown):
title = 'Info about %s' % title
if markdown:
util.print_markdown(data, title=title)
else:
util.print_table(data, title=title)
def list_models():
def exclude_dir(dir_name):
# exclude common cache directories and hidden directories
exclude = ['cache', 'pycache', '__pycache__']
return dir_name in exclude or dir_name.startswith('.')
exclude = ("cache", "pycache", "__pycache__")
return dir_name in exclude or dir_name.startswith(".")
data_path = util.get_data_path()
if data_path:
models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()]
return ', '.join([m for m in models if not exclude_dir(m)])
return '-'
return ", ".join([m for m in models if not exclude_dir(m)])
return "-"

View File

@ -11,13 +11,12 @@ from preshed.counter import PreshCounter
import tarfile
import gzip
import zipfile
import ujson as json
from spacy.lexeme import intify_attrs
from wasabi import Printer
from ._messages import Messages
from ..vectors import Vectors
from ..errors import Errors, Warnings, user_warning
from ..util import prints, ensure_path, get_lang_class
from ..util import ensure_path, get_lang_class, read_jsonl
try:
import ftfy
@ -25,121 +24,133 @@ except ImportError:
ftfy = None
msg = Printer()
@plac.annotations(
lang=("model language", "positional", None, str),
output_dir=("model output directory", "positional", None, Path),
freqs_loc=("location of words frequencies file", "option", "f", Path),
jsonl_loc=("location of JSONL-formatted attributes file", "option", "j", Path),
clusters_loc=("optional: location of brown clusters data",
"option", "c", str),
vectors_loc=("optional: location of vectors file in Word2Vec format "
"(either as .txt or zipped as .zip or .tar.gz)", "option",
"v", str),
prune_vectors=("optional: number of vectors to prune to",
"option", "V", int)
lang=("Model language", "positional", None, str),
output_dir=("Model output directory", "positional", None, Path),
freqs_loc=("Location of words frequencies file", "option", "f", Path),
jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
clusters_loc=("Optional location of brown clusters data", "option", "c", str),
vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
)
def init_model(lang, output_dir, freqs_loc=None, clusters_loc=None, jsonl_loc=None,
vectors_loc=None, prune_vectors=-1):
def init_model(
lang,
output_dir,
freqs_loc=None,
clusters_loc=None,
jsonl_loc=None,
vectors_loc=None,
prune_vectors=-1,
):
"""
Create a new model from raw data, like word frequencies, Brown clusters
and word vectors.
and word vectors. If vectors are provided in Word2Vec format, they can
be either a .txt or zipped as a .zip or .tar.gz.
"""
if jsonl_loc is not None:
if freqs_loc is not None or clusters_loc is not None:
settings = ['-j']
settings = ["-j"]
if freqs_loc:
settings.append('-f')
settings.append("-f")
if clusters_loc:
settings.append('-c')
prints(' '.join(settings),
title=(
"The -f and -c arguments are deprecated, and not compatible "
"with the -j argument, which should specify the same information. "
"Either merge the frequencies and clusters data into the "
"jsonl-formatted file (recommended), or use only the -f and "
"-c files, without the other lexical attributes."))
settings.append("-c")
msg.warn(Messages.M063, Messages.M064)
jsonl_loc = ensure_path(jsonl_loc)
lex_attrs = (json.loads(line) for line in jsonl_loc.open())
lex_attrs = read_jsonl(jsonl_loc)
else:
clusters_loc = ensure_path(clusters_loc)
freqs_loc = ensure_path(freqs_loc)
if freqs_loc is not None and not freqs_loc.exists():
prints(freqs_loc, title=Messages.M037, exits=1)
msg.fail(Messages.M037, freqs_loc, exits=1)
lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)
nlp = create_model(lang, lex_attrs)
with msg.loading("Creating model..."):
nlp = create_model(lang, lex_attrs)
msg.good("Successfully created model")
if vectors_loc is not None:
add_vectors(nlp, vectors_loc, prune_vectors)
vec_added = len(nlp.vocab.vectors)
lex_added = len(nlp.vocab)
prints(Messages.M039.format(entries=lex_added, vectors=vec_added),
title=Messages.M038)
msg.good(Messages.M038, Messages.M039.format(entries=lex_added, vectors=vec_added))
if not output_dir.exists():
output_dir.mkdir()
nlp.to_disk(output_dir)
return nlp
def open_file(loc):
'''Handle .gz, .tar.gz or unzipped files'''
"""Handle .gz, .tar.gz or unzipped files"""
loc = ensure_path(loc)
print("Open loc")
if tarfile.is_tarfile(str(loc)):
return tarfile.open(str(loc), 'r:gz')
elif loc.parts[-1].endswith('gz'):
return (line.decode('utf8') for line in gzip.open(str(loc), 'r'))
elif loc.parts[-1].endswith('zip'):
return tarfile.open(str(loc), "r:gz")
elif loc.parts[-1].endswith("gz"):
return (line.decode("utf8") for line in gzip.open(str(loc), "r"))
elif loc.parts[-1].endswith("zip"):
zip_file = zipfile.ZipFile(str(loc))
names = zip_file.namelist()
file_ = zip_file.open(names[0])
return (line.decode('utf8') for line in file_)
return (line.decode("utf8") for line in file_)
else:
return loc.open('r', encoding='utf8')
return loc.open("r", encoding="utf8")
def read_attrs_from_deprecated(freqs_loc, clusters_loc):
probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20)
clusters = read_clusters(clusters_loc) if clusters_loc else {}
with msg.loading("Counting frequencies..."):
probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20)
msg.good("Counted frequencies")
with msg.loading("Reading clusters..."):
clusters = read_clusters(clusters_loc) if clusters_loc else {}
msg.good("Read clusters")
lex_attrs = []
sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
for i, (word, prob) in tqdm(enumerate(sorted_probs)):
attrs = {'orth': word, 'id': i, 'prob': prob}
attrs = {"orth": word, "id": i, "prob": prob}
# Decode as a little-endian string, so that we can do & 15 to get
# the first 4 bits. See _parse_features.pyx
if word in clusters:
attrs['cluster'] = int(clusters[word][::-1], 2)
attrs["cluster"] = int(clusters[word][::-1], 2)
else:
attrs['cluster'] = 0
attrs["cluster"] = 0
lex_attrs.append(attrs)
return lex_attrs
def create_model(lang, lex_attrs):
print("Creating model...")
lang_class = get_lang_class(lang)
nlp = lang_class()
for lexeme in nlp.vocab:
lexeme.rank = 0
lex_added = 0
for attrs in lex_attrs:
if 'settings' in attrs:
if "settings" in attrs:
continue
lexeme = nlp.vocab[attrs['orth']]
lexeme = nlp.vocab[attrs["orth"]]
lexeme.set_attrs(**attrs)
lexeme.is_oov = False
lex_added += 1
lex_added += 1
oov_prob = min(lex.prob for lex in nlp.vocab)
nlp.vocab.cfg.update({'oov_prob': oov_prob-1})
nlp.vocab.cfg.update({"oov_prob": oov_prob - 1})
return nlp
def add_vectors(nlp, vectors_loc, prune_vectors):
vectors_loc = ensure_path(vectors_loc)
if vectors_loc and vectors_loc.parts[-1].endswith('.npz'):
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open('rb')))
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
for lex in nlp.vocab:
if lex.rank:
nlp.vocab.vectors.add(lex.orth, row=lex.rank)
else:
vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
if vectors_loc:
with msg.loading("Reading vectors from {}".format(vectors_loc)):
vectors_data, vector_keys = read_vectors(vectors_loc)
msg.good("Loaded vectors from {}".format(vectors_loc))
else:
vectors_data, vector_keys = (None, None)
if vector_keys is not None:
for word in vector_keys:
if word not in nlp.vocab:
@ -147,35 +158,34 @@ def add_vectors(nlp, vectors_loc, prune_vectors):
lexeme.is_oov = False
if vectors_data is not None:
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
nlp.vocab.vectors.name = '%s_model.vectors' % nlp.meta['lang']
nlp.meta['vectors']['name'] = nlp.vocab.vectors.name
nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
if prune_vectors >= 1:
nlp.vocab.prune_vectors(prune_vectors)
def read_vectors(vectors_loc):
print("Reading vectors from %s" % vectors_loc)
f = open_file(vectors_loc)
shape = tuple(int(size) for size in next(f).split())
vectors_data = numpy.zeros(shape=shape, dtype='f')
vectors_data = numpy.zeros(shape=shape, dtype="f")
vectors_keys = []
for i, line in enumerate(tqdm(f)):
line = line.rstrip()
pieces = line.rsplit(' ', vectors_data.shape[1]+1)
pieces = line.rsplit(" ", vectors_data.shape[1] + 1)
word = pieces.pop(0)
if len(pieces) != vectors_data.shape[1]:
raise ValueError(Errors.E094.format(line_num=i, loc=vectors_loc))
vectors_data[i] = numpy.asarray(pieces, dtype='f')
msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
vectors_data[i] = numpy.asarray(pieces, dtype="f")
vectors_keys.append(word)
return vectors_data, vectors_keys
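For readers who haven't seen it, `read_vectors()` above parses the common plain-text vectors format: a header line with the number of rows and the vector width, then one whitespace-separated row per word. A self-contained sketch with made-up values:

```python
# Mirrors the parsing logic above on an in-memory string instead of a file.
sample = "2 3\napple 0.1 0.2 0.3\nbanana 0.4 0.5 0.6\n"
lines = sample.splitlines()
shape = tuple(int(size) for size in lines[0].split())   # (2, 3)
for line in lines[1:]:
    pieces = line.rsplit(" ", shape[1] + 1)
    word, values = pieces[0], pieces[1:]
    assert len(values) == shape[1]
```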
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
print("Counting frequencies...")
counts = PreshCounter()
total = 0
with freqs_loc.open() as f:
for i, line in enumerate(f):
freq, doc_freq, key = line.rstrip().split('\t', 2)
freq, doc_freq, key = line.rstrip().split("\t", 2)
freq = int(freq)
counts.inc(i + 1, freq)
total += freq
@@ -184,7 +194,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
probs = {}
with freqs_loc.open() as f:
for line in tqdm(f):
freq, doc_freq, key = line.rstrip().split('\t', 2)
freq, doc_freq, key = line.rstrip().split("\t", 2)
doc_freq = int(doc_freq)
freq = int(freq)
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
@@ -196,7 +206,6 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
def read_clusters(clusters_loc):
print("Reading clusters...")
clusters = {}
if ftfy is None:
user_warning(Warnings.W004)
@@ -213,7 +222,7 @@ def read_clusters(clusters_loc):
if int(freq) >= 3:
clusters[word] = cluster
else:
clusters[word] = '0'
clusters[word] = "0"
# Expand clusters with re-casing
for word, cluster in list(clusters.items()):
if word.lower() not in clusters:


@@ -3,51 +3,54 @@ from __future__ import unicode_literals
import plac
from pathlib import Path
from wasabi import Printer
from ._messages import Messages
from ..compat import symlink_to, path2str
from ..util import prints
from .. import util
@plac.annotations(
origin=("package name or local path to model", "positional", None, str),
link_name=("name of shortcut link to create", "positional", None, str),
force=("force overwriting of existing link", "flag", "f", bool))
force=("force overwriting of existing link", "flag", "f", bool),
)
def link(origin, link_name, force=False, model_path=None):
"""
Create a symlink for models within the spacy/data directory. Accepts
either the name of a pip package, or the local path to the model data
directory. Linking models allows loading them via spacy.load(link_name).
"""
msg = Printer()
if util.is_package(origin):
model_path = util.get_package_path(origin)
else:
model_path = Path(origin) if model_path is None else Path(model_path)
if not model_path.exists():
prints(Messages.M009.format(path=path2str(model_path)),
title=Messages.M008, exits=1)
msg.fail(
Messages.M008, Messages.M009.format(path=path2str(model_path)), exits=1
)
data_path = util.get_data_path()
if not data_path or not data_path.exists():
spacy_loc = Path(__file__).parent.parent
prints(Messages.M011, spacy_loc, title=Messages.M010, exits=1)
msg.fail(Messages.M010, Messages.M011.format(path=spacy_loc), exits=1)
link_path = util.get_data_path() / link_name
if link_path.is_symlink() and not force:
prints(Messages.M013, title=Messages.M012.format(name=link_name),
exits=1)
msg.fail(Messages.M012.format(name=link_name), Messages.M013, exits=1)
elif link_path.is_symlink(): # does a symlink exist?
# NB: It's important to check for is_symlink here and not for exists,
# because invalid/outdated symlinks would return False otherwise.
link_path.unlink()
elif link_path.exists(): # does it exist otherwise?
elif link_path.exists(): # does it exist otherwise?
# NB: Check this last because valid symlinks also "exist".
prints(Messages.M015, link_path,
title=Messages.M014.format(name=link_name), exits=1)
msg = "%s --> %s" % (path2str(model_path), path2str(link_path))
msg.fail(Messages.M014.format(name=link_name), Messages.M015, exits=1)
details = "%s --> %s" % (path2str(model_path), path2str(link_path))
try:
symlink_to(link_path, model_path)
except:
except: # noqa: E722
# This is quite dirty, but just making sure other errors are caught.
prints(Messages.M017, msg, title=Messages.M016.format(name=link_name))
msg.fail(Messages.M016.format(name=link_name), Messages.M017)
msg.text(details)
raise
prints(msg, Messages.M019.format(name=link_name), title=Messages.M018)
msg.good(Messages.M018, details)
msg.text(Messages.M019.format(name=link_name))


@@ -4,109 +4,106 @@ from __future__ import unicode_literals
import plac
import shutil
from pathlib import Path
from wasabi import Printer, get_raw_input
from ._messages import Messages
from ..compat import path2str, json_dumps
from ..util import prints
from .. import util
from .. import about
@plac.annotations(
input_dir=("directory with model data", "positional", None, str),
output_dir=("output parent directory", "positional", None, str),
meta_path=("path to meta.json", "option", "m", str),
create_meta=("create meta.json, even if one exists in directory if "
"existing meta is found, entries are shown as defaults in "
"the command line prompt", "flag", "c", bool),
force=("force overwriting of existing model directory in output directory",
"flag", "f", bool))
def package(input_dir, output_dir, meta_path=None, create_meta=False,
force=False):
input_dir=("Directory with model data", "positional", None, str),
output_dir=("Output parent directory", "positional", None, str),
meta_path=("Path to meta.json", "option", "m", str),
create_meta=("Create meta.json, even if one exists", "flag", "c", bool),
force=("Force overwriting existing model in output directory", "flag", "f", bool),
)
def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False):
"""
Generate Python package for model data, including meta and required
installation files. A new directory will be created in the specified
output directory, and model data will be copied over.
output directory, and model data will be copied over. If --create-meta is
set and a meta.json already exists in the output directory, the existing
values will be used as the defaults in the command-line prompt.
"""
msg = Printer()
input_path = util.ensure_path(input_dir)
output_path = util.ensure_path(output_dir)
meta_path = util.ensure_path(meta_path)
if not input_path or not input_path.exists():
prints(input_path, title=Messages.M008, exits=1)
msg.fail(Messages.M008, input_path, exits=1)
if not output_path or not output_path.exists():
prints(output_path, title=Messages.M040, exits=1)
msg.fail(Messages.M040, output_path, exits=1)
if meta_path and not meta_path.exists():
prints(meta_path, title=Messages.M020, exits=1)
msg.fail(Messages.M020, meta_path, exits=1)
meta_path = meta_path or input_path / 'meta.json'
meta_path = meta_path or input_path / "meta.json"
if meta_path.is_file():
meta = util.read_json(meta_path)
if not create_meta: # only print this if user doesn't want to overwrite
prints(meta_path, title=Messages.M041)
if not create_meta: # only print if user doesn't want to overwrite
msg.good(Messages.M041, meta_path)
else:
meta = generate_meta(input_dir, meta)
meta = validate_meta(meta, ['lang', 'name', 'version'])
model_name = meta['lang'] + '_' + meta['name']
model_name_v = model_name + '-' + meta['version']
meta = generate_meta(input_dir, meta, msg)
for key in ("lang", "name", "version"):
if key not in meta or meta[key] == "":
msg.fail(Messages.M048.format(key=key), Messages.M049, exits=1)
model_name = meta["lang"] + "_" + meta["name"]
model_name_v = model_name + "-" + meta["version"]
main_path = output_path / model_name_v
package_path = main_path / model_name
create_dirs(package_path, force)
shutil.copytree(path2str(input_path),
path2str(package_path / model_name_v))
create_file(main_path / 'meta.json', json_dumps(meta))
create_file(main_path / 'setup.py', TEMPLATE_SETUP)
create_file(main_path / 'MANIFEST.in', TEMPLATE_MANIFEST)
create_file(package_path / '__init__.py', TEMPLATE_INIT)
prints(main_path, Messages.M043,
title=Messages.M042.format(name=model_name_v))
def create_dirs(package_path, force):
if package_path.exists():
if force:
shutil.rmtree(path2str(package_path))
else:
prints(package_path, Messages.M045, title=Messages.M044, exits=1)
msg.fail(
Messages.M044,
Messages.M045.format(path=path2str(package_path)),
exits=1,
)
Path.mkdir(package_path, parents=True)
shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
create_file(main_path / "meta.json", json_dumps(meta))
create_file(main_path / "setup.py", TEMPLATE_SETUP)
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
create_file(package_path / "__init__.py", TEMPLATE_INIT)
msg.good(Messages.M042.format(name=model_name_v), main_path)
msg.text(Messages.M043)
def create_file(file_path, contents):
file_path.touch()
file_path.open('w', encoding='utf-8').write(contents)
file_path.open("w", encoding="utf-8").write(contents)
def generate_meta(model_path, existing_meta):
def generate_meta(model_path, existing_meta, msg):
meta = existing_meta or {}
settings = [('lang', 'Model language', meta.get('lang', 'en')),
('name', 'Model name', meta.get('name', 'model')),
('version', 'Model version', meta.get('version', '0.0.0')),
('spacy_version', 'Required spaCy version',
'>=%s,<3.0.0' % about.__version__),
('description', 'Model description',
meta.get('description', False)),
('author', 'Author', meta.get('author', False)),
('email', 'Author email', meta.get('email', False)),
('url', 'Author website', meta.get('url', False)),
('license', 'License', meta.get('license', 'CC BY-SA 3.0'))]
settings = [
("lang", "Model language", meta.get("lang", "en")),
("name", "Model name", meta.get("name", "model")),
("version", "Model version", meta.get("version", "0.0.0")),
("spacy_version", "Required spaCy version", ">=%s,<3.0.0" % about.__version__),
("description", "Model description", meta.get("description", False)),
("author", "Author", meta.get("author", False)),
("email", "Author email", meta.get("email", False)),
("url", "Author website", meta.get("url", False)),
("license", "License", meta.get("license", "CC BY-SA 3.0")),
]
nlp = util.load_model_from_path(Path(model_path))
meta['pipeline'] = nlp.pipe_names
meta['vectors'] = {'width': nlp.vocab.vectors_length,
'vectors': len(nlp.vocab.vectors),
'keys': nlp.vocab.vectors.n_keys}
prints(Messages.M047, title=Messages.M046)
meta["pipeline"] = nlp.pipe_names
meta["vectors"] = {
"width": nlp.vocab.vectors_length,
"vectors": len(nlp.vocab.vectors),
"keys": nlp.vocab.vectors.n_keys,
}
msg.divider(Messages.M046)
msg.text(Messages.M047)
for setting, desc, default in settings:
response = util.get_raw_input(desc, default)
meta[setting] = default if response == '' and default else response
if about.__title__ != 'spacy':
meta['parent_package'] = about.__title__
return meta
def validate_meta(meta, keys):
for key in keys:
if key not in meta or meta[key] == '':
prints(Messages.M049, title=Messages.M048.format(key=key), exits=1)
response = get_raw_input(desc, default)
meta[setting] = default if response == "" and default else response
if about.__title__ != "spacy":
meta["parent_package"] = about.__title__
return meta


@@ -1,66 +1,148 @@
'''This script is experimental.
Try pre-training the CNN component of the text categorizer using a cheap
language modelling-like objective. Specifically, we load pre-trained vectors
(from something like word2vec, GloVe, FastText etc), and use the CNN to
predict the tokens' pre-trained vectors. This isn't as easy as it sounds:
we're not merely doing compression here, because heavy dropout is applied,
including over the input words. This means the model must often (50% of the time)
use the context in order to predict the word.
To evaluate the technique, we're pre-training with the 50k texts from the IMDB
corpus, and then training with only 100 labels. Note that it's a bit dirty to
pre-train with the development data, but also not *so* terrible: we're not using
the development labels, after all --- only the unlabelled text.
'''
# coding: utf8
from __future__ import print_function, unicode_literals
import plac
import random
import numpy
import time
import ujson as json
from pathlib import Path
import ujson
import sys
from collections import Counter
import spacy
from spacy.tokens import Doc
from spacy.attrs import ID, HEAD
from spacy.util import minibatch, minibatch_by_words, use_gpu, compounding, ensure_path
from spacy._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
from pathlib import Path
from thinc.v2v import Affine, Maxout
from thinc.api import wrap
from thinc.misc import LayerNorm as LN
from thinc.neural.util import prefer_gpu
from wasabi import Printer
from ..tokens import Doc
from ..attrs import ID, HEAD
from ..compat import json_dumps
from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
from .. import util
def prefer_gpu():
used = spacy.util.use_gpu(0)
if used is None:
return False
else:
import cupy.random
cupy.random.seed(0)
return True
@plac.annotations(
texts_loc=("Path to jsonl file with texts to learn from", "positional", None, str),
vectors_model=("Name or path to vectors model to learn from"),
output_dir=("Directory to write models each epoch", "positional", None, str),
width=("Width of CNN layers", "option", "cw", int),
depth=("Depth of CNN layers", "option", "cd", int),
embed_rows=("Embedding rows", "option", "er", int),
use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
dropout=("Dropout", "option", "d", float),
seed=("Seed for random number generators", "option", "s", float),
nr_iter=("Number of iterations to pretrain", "option", "i", int),
)
def pretrain(
texts_loc,
vectors_model,
output_dir,
width=96,
depth=4,
embed_rows=2000,
use_vectors=False,
dropout=0.2,
nr_iter=1000,
seed=0,
):
"""
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
using an approximate language-modelling objective. Specifically, we load
pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
vectors which match the pre-trained ones. The weights are saved to a directory
after each epoch. You can then pass a path to one of these pre-trained weights
files to the 'spacy train' command.
This technique may be especially helpful if you have little labelled data.
However, it's still quite experimental, so your mileage may vary.
def load_texts(path):
'''Load inputs from a jsonl file.
Each line should be a dict like {"text": "..."}
'''
path = ensure_path(path)
with path.open('r', encoding='utf8') as file_:
texts = [json.loads(line) for line in file_]
random.shuffle(texts)
return texts
To load the weights back in during 'spacy train', you need to ensure
all settings are the same between pretraining and training. The API and
errors around this need some improvement.
"""
config = dict(locals())
msg = Printer()
util.fix_random_seed(seed)
has_gpu = prefer_gpu()
msg.info("Using GPU" if has_gpu else "Not using GPU")
output_dir = Path(output_dir)
if not output_dir.exists():
output_dir.mkdir()
msg.good("Created output directory")
util.write_json(output_dir / "config.json", config)
msg.good("Saved settings to config.json")
# Load texts from file or stdin
if texts_loc != "-": # reading from a file
texts_loc = Path(texts_loc)
if not texts_loc.exists():
msg.fail("Input text file doesn't exist", texts_loc, exits=1)
with msg.loading("Loading input texts..."):
texts = list(util.read_jsonl(texts_loc))
msg.good("Loaded input texts")
random.shuffle(texts)
else: # reading from stdin
msg.text("Reading input text from stdin...")
texts = stream_texts()
with msg.loading("Loading model '{}'...".format(vectors_model)):
nlp = util.load_model(vectors_model)
msg.good("Loaded model '{}'".format(vectors_model))
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
model = create_pretraining_model(
nlp,
Tok2Vec(
width,
embed_rows,
conv_depth=depth,
pretrained_vectors=pretrained_vectors,
bilstm_depth=0, # Requires PyTorch. Experimental.
cnn_maxout_pieces=2, # You can try setting this higher
subword_features=True,
),
) # subword_features: set to False for character models, e.g. Chinese
optimizer = create_default_optimizer(model.ops)
tracker = ProgressTracker()
msg.divider("Pre-training tok2vec layer")
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
for epoch in range(nr_iter):
for batch in util.minibatch_by_words(
((text, None) for text in texts), size=5000
):
docs = make_docs(nlp, [text for (text, _) in batch])
loss = make_update(model, docs, optimizer, drop=dropout)
progress = tracker.update(epoch, loss, docs)
if progress:
msg.row(progress, **row_settings)
if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
break
with model.use_params(optimizer.averages):
with (output_dir / ("model%d.bin" % epoch)).open("wb") as file_:
file_.write(model.tok2vec.to_bytes())
log = {
"nr_word": tracker.nr_word,
"loss": tracker.loss,
"epoch_loss": tracker.epoch_loss,
"epoch": epoch,
}
with (output_dir / "log.jsonl").open("a") as file_:
file_.write(json_dumps(log) + "\n")
tracker.epoch_loss = 0.0
if texts_loc != "-":
# Reshuffle the texts if texts were loaded from a file
random.shuffle(texts)
def stream_texts():
for line in sys.stdin:
yield json.loads(line)
yield ujson.loads(line)
def make_update(model, docs, optimizer, drop=0.):
def make_update(model, docs, optimizer, drop=0.0):
"""Perform an update over a single batch of documents.
docs (iterable): A batch of `Doc` objects.
@@ -74,7 +156,7 @@ def make_update(model, docs, optimizer, drop=0.):
# Don't want to return a cupy object here
# The gradients are modified in-place by the BERT MLM,
# so we get an accurate loss
loss = float((gradients**2).mean())
loss = float((gradients ** 2).mean())
return loss
@@ -98,7 +180,7 @@ def make_docs(nlp, batch):
def get_vectors_loss(ops, docs, prediction):
"""Compute a mean-squared error loss between the documents' vectors and
the prediction.
the prediction.
Note that this is ripe for customization! We could compute the vectors
in some other way, e.g. with an LSTM language model, or use some other
@@ -115,43 +197,40 @@ def get_vectors_loss(ops, docs, prediction):
def create_pretraining_model(nlp, tok2vec):
'''Define a network for the pretraining. We simply add an output layer onto
"""Define a network for the pretraining. We simply add an output layer onto
the tok2vec input model. The tok2vec input model needs to be a model that
takes a batch of Doc objects (as a list), and returns a list of arrays.
Each array in the output needs to have one row per token in the doc.
'''
"""
output_size = nlp.vocab.vectors.data.shape[1]
output_layer = chain(
LN(Maxout(300, pieces=3)),
zero_init(Affine(output_size, drop_factor=0.0))
LN(Maxout(300, pieces=3)), zero_init(Affine(output_size, drop_factor=0.0))
)
# This is annoying, but the parser etc have the flatten step after
# the tok2vec. To load the weights in cleanly, we need to match
# the shape of the models' components exactly. So what we call
# "tok2vec" has to be the same set of processes as what the components do.
tok2vec = chain(tok2vec, flatten)
model = chain(
tok2vec,
output_layer
)
model = chain(tok2vec, output_layer)
model = masked_language_model(nlp.vocab, model)
model.tok2vec = tok2vec
model.output_layer = output_layer
model.begin_training([nlp.make_doc('Give it a doc to infer shapes')])
model.begin_training([nlp.make_doc("Give it a doc to infer shapes")])
return model
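To illustrate the contract stated in the docstring above, a list of `Doc` objects in and a list of arrays with one row per token out, here is a minimal stand-in sketch (not spaCy code; plain token lists stand in for `Doc`s):

```python
import numpy

def dummy_tok2vec(docs, width=96):
    # one (n_tokens, width) array per "doc"
    return [numpy.zeros((len(doc), width), dtype="f") for doc in docs]

outputs = dummy_tok2vec([["Give", "it", "a", "doc"], ["Hello"]])
assert [o.shape for o in outputs] == [(4, 96), (1, 96)]
```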
def masked_language_model(vocab, model, mask_prob=0.15):
'''Convert a model into a BERT-style masked language model'''
"""Convert a model into a BERT-style masked language model"""
random_words = RandomWords(vocab)
def mlm_forward(docs, drop=0.):
def mlm_forward(docs, drop=0.0):
mask, docs = apply_mask(docs, random_words, mask_prob=mask_prob)
mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
output, backprop = model.begin_update(docs, drop=drop)
def mlm_backward(d_output, sgd=None):
d_output *= 1-mask
d_output *= 1 - mask
return backprop(d_output, sgd=sgd)
return output, mlm_backward
@@ -161,7 +240,7 @@ def masked_language_model(vocab, model, mask_prob=0.15):
def apply_mask(docs, random_words, mask_prob=0.15):
N = sum(len(doc) for doc in docs)
mask = numpy.random.uniform(0., 1.0, (N,))
mask = numpy.random.uniform(0.0, 1.0, (N,))
mask = mask >= mask_prob
i = 0
masked_docs = []
@@ -184,7 +263,7 @@ def apply_mask(docs, random_words, mask_prob=0.15):
return mask, masked_docs
def replace_word(word, random_words, mask='[MASK]'):
def replace_word(word, random_words, mask="[MASK]"):
roll = random.random()
if roll < 0.8:
return mask
@@ -193,23 +272,25 @@ def replace_word(word, random_words, mask='[MASK]'):
else:
return word
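As a rough illustration of the corruption scheme implemented by `apply_mask()` and `replace_word()` above: about `mask_prob` of the tokens are altered, and an altered token usually becomes the mask symbol, otherwise it is swapped for a random vocabulary word or kept as-is. The standalone sketch below assumes a 0.9 threshold for the secondary split, which the hunk elides, so treat it as illustrative only:

```python
import random

def corrupt(token, mask_prob=0.15, mask="[MASK]", random_word=lambda: "<random>"):
    if random.random() >= mask_prob:   # most tokens pass through untouched
        return token
    roll = random.random()
    if roll < 0.8:                     # usually substitute the mask symbol
        return mask
    elif roll < 0.9:                   # sometimes a random word (assumed split)
        return random_word()
    return token                       # occasionally keep the original

print([corrupt(t) for t in "give it a doc to infer shapes".split()])
```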
class RandomWords(object):
def __init__(self, vocab):
self.words = [lex.text for lex in vocab if lex.prob != 0.0]
self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
self.words = self.words[:10000]
self.probs = self.probs[:10000]
self.probs = numpy.exp(numpy.array(self.probs, dtype='f'))
self.probs = numpy.exp(numpy.array(self.probs, dtype="f"))
self.probs /= self.probs.sum()
self._cache = []
def next(self):
if not self._cache:
self._cache.extend(numpy.random.choice(len(self.words), 10000,
p=self.probs))
self._cache.extend(
numpy.random.choice(len(self.words), 10000, p=self.probs)
)
index = self._cache.pop()
return self.words[index]
class ProgressTracker(object):
def __init__(self, frequency=1000000):
@@ -245,76 +326,3 @@ class ProgressTracker(object):
return status
else:
return None
@plac.annotations(
texts_loc=("Path to jsonl file with texts to learn from", "positional", None, str),
vectors_model=("Name or path to vectors model to learn from"),
output_dir=("Directory to write models each epoch", "positional", None, str),
width=("Width of CNN layers", "option", "cw", int),
depth=("Depth of CNN layers", "option", "cd", int),
embed_rows=("Embedding rows", "option", "er", int),
use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
dropout=("Dropout", "option", "d", float),
seed=("Seed for random number generators", "option", "s", float),
nr_iter=("Number of iterations to pretrain", "option", "i", int),
)
def pretrain(texts_loc, vectors_model, output_dir, width=96, depth=4,
embed_rows=2000, use_vectors=False, dropout=0.2, nr_iter=1000, seed=0):
"""
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
using an approximate language-modelling objective. Specifically, we load
pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
vectors which match the pre-trained ones. The weights are saved to a directory
after each epoch. You can then pass a path to one of these pre-trained weights
files to the 'spacy train' command.
This technique may be especially helpful if you have little labelled data.
However, it's still quite experimental, so your mileage may vary.
To load the weights back in during 'spacy train', you need to ensure
all settings are the same between pretraining and training. The API and
errors around this need some improvement.
"""
config = dict(locals())
output_dir = ensure_path(output_dir)
random.seed(seed)
numpy.random.seed(seed)
if not output_dir.exists():
output_dir.mkdir()
with (output_dir / 'config.json').open('w') as file_:
file_.write(json.dumps(config))
has_gpu = prefer_gpu()
print("Use GPU?", has_gpu)
nlp = spacy.load(vectors_model)
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
model = create_pretraining_model(nlp,
Tok2Vec(width, embed_rows,
conv_depth=depth,
pretrained_vectors=pretrained_vectors,
bilstm_depth=0, # Requires PyTorch. Experimental.
cnn_maxout_pieces=2, # You can try setting this higher
subword_features=True)) # Set to False for character models, e.g. Chinese
optimizer = create_default_optimizer(model.ops)
tracker = ProgressTracker()
print('Epoch', '#Words', 'Loss', 'w/s')
texts = stream_texts() if texts_loc == '-' else load_texts(texts_loc)
for epoch in range(nr_iter):
for batch in minibatch_by_words(((text, None) for text in texts), size=5000):
docs = make_docs(nlp, [text for (text, _) in batch])
loss = make_update(model, docs, optimizer, drop=dropout)
progress = tracker.update(epoch, loss, docs)
if progress:
print(*progress)
if texts_loc == '-' and tracker.words_per_epoch[epoch] >= 10**7:
break
with model.use_params(optimizer.averages):
with (output_dir / ('model%d.bin' % epoch)).open('wb') as file_:
file_.write(model.tok2vec.to_bytes())
with (output_dir / 'log.jsonl').open('a') as file_:
file_.write(json.dumps({'nr_word': tracker.nr_word,
'loss': tracker.loss, 'epoch_loss': tracker.epoch_loss,
'epoch': epoch}) + '\n')
tracker.epoch_loss = 0.0
if texts_loc != '-':
texts = load_texts(texts_loc)


@@ -6,45 +6,64 @@ from pathlib import Path
import ujson
import cProfile
import pstats
import spacy
import sys
import tqdm
import cytoolz
import thinc.extra.datasets
from wasabi import Printer
def read_inputs(loc):
if loc is None:
file_ = sys.stdin
file_ = (line.encode('utf8') for line in file_)
else:
file_ = Path(loc).open()
for line in file_:
data = ujson.loads(line)
text = data['text']
yield text
from ..util import load_model
@plac.annotations(
lang=("model/language", "positional", None, str),
inputs=("Location of input file", "positional", None, read_inputs))
def profile(lang, inputs=None):
model=("Model to load", "positional", None, str),
inputs=("Location of input file. '-' for stdin.", "positional", None, str),
n_texts=("Maximum number of texts to use if available", "option", "n", int),
)
def profile(model, inputs=None, n_texts=10000):
"""
Profile a spaCy pipeline, to find out which functions take the most time.
Input should be formatted as one JSON object per line with a key "text".
It can either be provided as a JSONL file, or be read from sys.stdin.
If no input file is specified, the IMDB dataset is loaded via Thinc.
"""
msg = Printer()
if inputs is not None:
inputs = _read_inputs(inputs, msg)
if inputs is None:
imdb_train, _ = thinc.extra.datasets.imdb()
inputs, _ = zip(*imdb_train)
inputs = inputs[:25000]
nlp = spacy.load(lang)
texts = list(cytoolz.take(10000, inputs))
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
"Profile.prof")
n_inputs = 25000
with msg.loading("Loading IMDB dataset via Thinc..."):
imdb_train, _ = thinc.extra.datasets.imdb()
inputs, _ = zip(*imdb_train)
msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
inputs = inputs[:n_inputs]
with msg.loading("Loading model '{}'...".format(model)):
nlp = load_model(model)
msg.good("Loaded model '{}'".format(model))
texts = list(cytoolz.take(n_texts, inputs))
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
s = pstats.Stats("Profile.prof")
msg.divider("Profile stats")
s.strip_dirs().sort_stats("time").print_stats()
def parse_texts(nlp, texts):
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
pass
def _read_inputs(loc, msg):
if loc == "-":
msg.info("Reading input from sys.stdin")
file_ = sys.stdin
file_ = (line.encode("utf8") for line in file_)
else:
input_path = Path(loc)
if not input_path.exists() or not input_path.is_file():
msg.fail("Not a valid input data file", loc, exits=1)
msg.info("Using data from {}".format(input_path.parts[-1]))
file_ = input_path.open()
for line in file_:
data = ujson.loads(line)
text = data["text"]
yield text
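For reference, the input format expected here is newline-delimited JSON with a "text" key per line, as the docstring above states. A quick illustrative check:

```python
import ujson

jsonl = '{"text": "This is a sentence."}\n{"text": "And another one."}\n'
texts = [ujson.loads(line)["text"] for line in jsonl.splitlines()]
assert texts == ["This is a sentence.", "And another one."]
```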


@@ -0,0 +1,51 @@
# coding: utf-8
from __future__ import unicode_literals
from pathlib import Path
from jsonschema import Draft4Validator
from ...errors import Errors
from ...util import read_json
SCHEMAS = {}
def get_schema(name):
"""Get the JSON schema for a given name. Looks for a .json file in
spacy.cli.schemas, validates the schema and raises ValueError if not found.
EXAMPLE:
>>> schema = get_schema('training')
name (unicode): The name of the schema.
RETURNS (dict): The JSON schema.
"""
if name not in SCHEMAS:
schema_path = Path(__file__).parent / "{}.json".format(name)
if not schema_path.exists():
raise ValueError(Errors.E104.format(name=name))
schema = read_json(schema_path)
# TODO: replace with (stable) Draft6Validator, if available
validator = Draft4Validator(schema)
validator.check_schema(schema)
SCHEMAS[name] = schema
return SCHEMAS[name]
def validate_json(data, schema):
"""Validate data against a given JSON schema (see https://json-schema.org).
data: JSON-serializable data to validate.
schema (dict): The JSON schema.
RETURNS (list): A list of error messages, if available.
"""
validator = Draft4Validator(schema)
errors = []
for err in sorted(validator.iter_errors(data), key=lambda e: e.path):
if err.path:
err_path = "[{}]".format(" -> ".join([str(p) for p in err.path]))
else:
err_path = ""
errors.append(err.message + " " + err_path)
return errors
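A short usage sketch for the two helpers above, assuming they are importable from the new `spacy.cli.schemas` package added in this commit:

```python
from spacy.cli.schemas import get_schema, validate_json

data = [{"text": "Hello world", "ents": [{"start": 0, "end": 5, "label": "GREETING"}]}]
errors = validate_json(data, get_schema("training"))
print(errors or "Training data matches the schema")
```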

spacy/cli/schemas/meta.json (new file)

@@ -0,0 +1,128 @@
{
"$schema": "http://json-schema.org/draft-06/schema",
"type": "object",
"properties": {
"lang": {
"title": "Two-letter language code, e.g. 'en'",
"type": "string",
"minLength": 2,
"maxLength": 2,
"pattern": "^[a-z]*$"
},
"name": {
"title": "Model name",
"type": "string",
"minLength": 1,
"pattern": "^[a-z_]*$"
},
"version": {
"title": "Model version",
"type": "string",
"minLength": 1,
"pattern": "^[0-9a-z.-]*$"
},
"spacy_version": {
"title": "Compatible spaCy version identifier",
"type": "string",
"minLength": 1,
"pattern": "^[0-9a-z.-><=]*$"
},
"parent_package": {
"title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
"type": "string",
"minLength": 1,
"default": "spacy"
},
"pipeline": {
"title": "Names of pipeline components",
"type": "array",
"items": {
"type": "string",
"minLength": 1
}
},
"description": {
"title": "Model description",
"type": "string"
},
"license": {
"title": "Model license",
"type": "string"
},
"author": {
"title": "Model author name",
"type": "string"
},
"email": {
"title": "Model author email",
"type": "string",
"format": "email"
},
"url": {
"title": "Model author URL",
"type": "string",
"format": "uri"
},
"sources": {
"title": "Training data sources",
"type": "array",
"items": {
"type": "string"
}
},
"vectors": {
"title": "Included word vectors",
"type": "object",
"properties": {
"keys": {
"title": "Number of unique keys",
"type": "integer",
"minimum": 0
},
"vectors": {
"title": "Number of unique vectors",
"type": "integer",
"minimum": 0
},
"width": {
"title": "Number of dimensions",
"type": "integer",
"minimum": 0
}
}
},
"accuracy": {
"title": "Accuracy numbers",
"type": "object",
"patternProperties": {
"*": {
"type": "number",
"minimum": 0.0
}
}
},
"speed": {
"title": "Speed evaluation numbers",
"type": "object",
"patternProperties": {
"*": {
"oneOf": [
{
"type": "number",
"minimum": 0.0
},
{
"type": "integer",
"minimum": 0
}
]
}
}
}
},
"required": [
"lang",
"name",
"version"
]
}
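For illustration, a minimal meta that satisfies this schema's required fields (`lang`, `name`, `version`) could be checked with the helpers above (import path assumed):

```python
from spacy.cli.schemas import get_schema, validate_json

meta = {"lang": "en", "name": "example_model", "version": "0.0.1"}
print(validate_json(meta, get_schema("meta")))  # expect no error messages
```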


@@ -0,0 +1,146 @@
{
"$schema": "http://json-schema.org/draft-06/schema",
"title": "Training data for spaCy models",
"type": "array",
"items": {
"type": "object",
"properties": {
"text": {
"title": "The text of the training example",
"type": "string",
"minLength": 1
},
"ents": {
"title": "Named entity spans in the text",
"type": "array",
"items": {
"type": "object",
"properties": {
"start": {
"title": "Start character offset of the span",
"type": "integer",
"minimum": 0
},
"end": {
"title": "End character offset of the span",
"type": "integer",
"minimum": 0
},
"label": {
"title": "Entity label",
"type": "string",
"minLength": 1,
"pattern": "^[A-Z0-9]*$"
}
},
"required": [
"start",
"end",
"label"
]
}
},
"sents": {
"title": "Sentence spans in the text",
"type": "array",
"items": {
"type": "object",
"properties": {
"start": {
"title": "Start character offset of the span",
"type": "integer",
"minimum": 0
},
"end": {
"title": "End character offset of the span",
"type": "integer",
"minimum": 0
}
},
"required": [
"start",
"end"
]
}
},
"cats": {
"title": "Text categories for the text classifier",
"type": "object",
"patternProperties": {
"*": {
"title": "A text category",
"oneOf": [
{
"type": "boolean"
},
{
"type": "number",
"minimum": 0
}
]
}
},
"propertyNames": {
"pattern": "^[A-Z0-9]*$",
"minLength": 1
}
},
"tokens": {
"title": "The tokens in the text",
"type": "array",
"items": {
"type": "object",
"minProperties": 1,
"properties": {
"id": {
"title": "Token ID, usually token index",
"type": "integer",
"minimum": 0
},
"start": {
"title": "Start character offset of the token",
"type": "integer",
"minimum": 0
},
"end": {
"title": "End character offset of the token",
"type": "integer",
"minimum": 0
},
"pos": {
"title": "Coarse-grained part-of-speech tag",
"type": "string",
"minLength": 1
},
"tag": {
"title": "Fine-grained part-of-speech tag",
"type": "string",
"minLength": 1
},
"dep": {
"title": "Dependency label",
"type": "string",
"minLength": 1
},
"head": {
"title": "Index of the token's head",
"type": "integer",
"minimum": 0
}
},
"required": [
"start",
"end"
]
}
},
"_": {
"title": "Custom user space",
"type": "object"
}
},
"required": [
"text"
]
}
}
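An illustrative record in the unified training format this schema describes (all values made up; the tokens list is truncated):

```python
example = [
    {
        "text": "Apple is looking at buying U.K. startup",
        "ents": [{"start": 0, "end": 5, "label": "ORG"}],
        "sents": [{"start": 0, "end": 39}],
        "cats": {"BUSINESS": 1.0},
        "tokens": [
            {"id": 0, "start": 0, "end": 5, "pos": "PROPN", "dep": "nsubj", "head": 2},
            # ... one entry per token
        ],
    }
]
```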


@@ -6,213 +6,296 @@ from pathlib import Path
import tqdm
from thinc.neural._classes.model import Model
from timeit import default_timer as timer
import json
import shutil
from wasabi import Printer
from ._messages import Messages
from .._ml import create_default_optimizer
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
from ..gold import GoldCorpus
from ..util import prints, minibatch, minibatch_by_words
from .. import util
from .. import about
from .. import displacy
from ..compat import json_dumps
# Take dropout and batch size as generators of values -- dropout
# starts high and decays sharply, to force the optimizer to explore.
# Batch size starts at 1 and grows, so that we make updates quickly
# at the beginning of training.
dropout_rates = util.decaying(
util.env_opt("dropout_from", 0.2),
util.env_opt("dropout_to", 0.2),
util.env_opt("dropout_decay", 0.0),
)
batch_sizes = util.compounding(
util.env_opt("batch_from", 1000),
util.env_opt("batch_to", 1000),
util.env_opt("batch_compound", 1.001),
)
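The comment above treats dropout and batch size as infinite generators; a standalone sketch of that pattern (illustrative, not spaCy's exact `util.decaying`/`util.compounding` implementations):

```python
import itertools

def decaying(start, stop, decay):
    value = start
    while True:
        yield max(value, stop)   # decay linearly down to a floor
        value -= decay

def compounding(start, stop, compound):
    value = start
    while True:
        yield min(value, stop)   # grow multiplicatively up to a ceiling
        value *= compound

print(list(itertools.islice(decaying(0.6, 0.2, 0.1), 5)))
print(list(itertools.islice(compounding(1.0, 32.0, 1.5), 5)))
```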
@plac.annotations(
lang=("model language", "positional", None, str),
output_dir=("output directory to store model in", "positional", None, str),
train_data=("location of JSON-formatted training data", "positional",
None, str),
dev_data=("location of JSON-formatted development data (optional)",
"positional", None, str),
n_iter=("number of iterations", "option", "n", int),
n_sents=("number of sentences", "option", "ns", int),
lang=("Model language", "positional", None, str),
output_path=("Output directory to store model in", "positional", None, Path),
train_path=("Location of JSON-formatted training data", "positional", None, Path),
dev_path=("Location of JSON-formatted development data", "positional", None, Path),
base_model=("Name of model to update (optional)", "option", "b", str),
pipeline=("Comma-separated names of pipeline components", "option", "p", str),
vectors=("Model to load vectors from", "option", "v", str),
n_iter=("Number of iterations", "option", "n", int),
n_examples=("Number of examples", "option", "ns", int),
use_gpu=("Use GPU", "option", "g", int),
vectors=("Model to load vectors from", "option", "v"),
no_tagger=("Don't train tagger", "flag", "T", bool),
no_parser=("Don't train parser", "flag", "P", bool),
no_entities=("Don't train NER", "flag", "N", bool),
parser_multitasks=("Side objectives for parser CNN, e.g. dep dep,tag", "option", "pt", str),
noise_level=("Amount of corruption to add for data augmentation", "option", "nl", float),
entity_multitasks=("Side objectives for ner CNN, e.g. dep dep,tag", "option", "et", str),
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
version=("Model version", "option", "V", str),
meta_path=("Optional path to meta.json. All relevant properties will be "
"overwritten.", "option", "m", Path),
init_tok2vec=("Path to pretrained weights for the token-to-vector parts "
"of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path),
verbose=("Display more information for debug", "option", None, bool))
def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
parser_multitasks='', entity_multitasks='', init_tok2vec=None,
use_gpu=-1, vectors=None, no_tagger=False, noise_level=0.0,
no_parser=False, no_entities=False, gold_preproc=False,
version="0.0.0", meta_path=None, verbose=False):
meta_path=("Optional path to meta.json to use as base.", "option", "m", Path),
init_tok2vec=(
"Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.",
"option",
"t2v",
Path,
),
parser_multitasks=(
"Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'",
"option",
"pt",
str,
),
entity_multitasks=(
"Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'",
"option",
"et",
str,
),
noise_level=("Amount of corruption for data augmentation", "option", "nl", float),
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool),
verbose=("Display more information for debug", "flag", "VV", bool),
debug=("Run data diagnostics before training", "flag", "D", bool),
)
def train(
lang,
output_path,
train_path,
dev_path,
base_model=None,
pipeline="tagger,parser,ner",
vectors=None,
n_iter=30,
n_examples=0,
use_gpu=-1,
version="0.0.0",
meta_path=None,
init_tok2vec=None,
parser_multitasks="",
entity_multitasks="",
noise_level=0.0,
gold_preproc=False,
learn_tokens=False,
verbose=False,
debug=False,
):
"""
Train a model. Expects data in spaCy's JSON format.
Train or update a spaCy model. Requires data to be formatted in spaCy's
JSON format. To convert data from other formats, use the `spacy convert`
command.
"""
msg = Printer()
util.fix_random_seed()
util.set_env_log(True)
n_sents = n_sents or None
output_path = util.ensure_path(output_dir)
train_path = util.ensure_path(train_data)
dev_path = util.ensure_path(dev_data)
util.set_env_log(verbose)
# Make sure all files and paths exists if they are needed
train_path = util.ensure_path(train_path)
dev_path = util.ensure_path(dev_path)
meta_path = util.ensure_path(meta_path)
if not train_path.exists():
prints(train_path, title=Messages.M050, exits=1)
if dev_path and not dev_path.exists():
prints(dev_path, title=Messages.M051, exits=1)
if not train_path or not train_path.exists():
msg.fail(Messages.M050, train_path, exits=1)
if not dev_path or not dev_path.exists():
msg.fail(Messages.M051, dev_path, exits=1)
if meta_path is not None and not meta_path.exists():
prints(meta_path, title=Messages.M020, exits=1)
msg.fail(Messages.M020, meta_path, exits=1)
meta = util.read_json(meta_path) if meta_path else {}
if not isinstance(meta, dict):
prints(Messages.M053.format(meta_type=type(meta)),
title=Messages.M052, exits=1)
meta.setdefault('lang', lang)
meta.setdefault('name', 'unnamed')
msg.fail(Messages.M052, Messages.M053.format(meta_type=type(meta)), exits=1)
if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
msg.fail(Messages.M062, Messages.M065)
if not output_path.exists():
output_path.mkdir()
print("Counting training words (limit=%s" % n_sents)
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
n_train_words = corpus.count_train()
print(n_train_words)
pipeline = ['tagger', 'parser', 'ner']
if no_tagger and 'tagger' in pipeline:
pipeline.remove('tagger')
if no_parser and 'parser' in pipeline:
pipeline.remove('parser')
if no_entities and 'ner' in pipeline:
pipeline.remove('ner')
# Set up the base model and pipeline. If a base model is specified, load
# the model and make sure the pipeline matches the pipeline setting. If
# training starts from a blank model, initialize the language class.
pipeline = [p.strip() for p in pipeline.split(",")]
msg.text(Messages.M055.format(pipeline=pipeline))
if base_model:
msg.text(Messages.M056.format(model=base_model))
nlp = util.load_model(base_model)
if nlp.lang != lang:
msg.fail(Messages.M072.format(model_lang=nlp.lang, lang=lang), exits=1)
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipeline]
nlp.disable_pipes(*other_pipes)
for pipe in pipeline:
if pipe not in nlp.pipe_names:
nlp.add_pipe(nlp.create_pipe(pipe))
else:
msg.text(Messages.M057.format(model=lang))
lang_cls = util.get_lang_class(lang)
nlp = lang_cls()
for pipe in pipeline:
nlp.add_pipe(nlp.create_pipe(pipe))
if learn_tokens:
nlp.add_pipe(nlp.create_pipe("merge_subtokens"))
# Take dropout and batch size as generators of values -- dropout
# starts high and decays sharply, to force the optimizer to explore.
# Batch size starts at 1 and grows, so that we make updates quickly
# at the beginning of training.
dropout_rates = util.decaying(util.env_opt('dropout_from', 0.1),
util.env_opt('dropout_to', 0.1),
util.env_opt('dropout_decay', 0.0))
batch_sizes = util.compounding(util.env_opt('batch_from', 750),
util.env_opt('batch_to', 750),
util.env_opt('batch_compound', 1.001))
dropout_rates = util.decaying(
util.env_opt("dropout_from", 0.1),
util.env_opt("dropout_to", 0.1),
util.env_opt("dropout_decay", 0.0),
)
batch_sizes = util.compounding(
util.env_opt("batch_from", 750),
util.env_opt("batch_to", 750),
util.env_opt("batch_compound", 1.001),
)
lang_class = util.get_lang_class(lang)
nlp = lang_class()
meta['pipeline'] = pipeline
meta["pipeline"] = pipeline
nlp.meta.update(meta)
if vectors:
print("Load vectors model", vectors)
util.load_model(vectors, vocab=nlp.vocab)
for lex in nlp.vocab:
values = {}
for attr, func in nlp.vocab.lex_attr_getters.items():
# These attrs are expected to be set by data. Others should
# be set by calling the language functions.
if attr not in (CLUSTER, PROB, IS_OOV, LANG):
values[lex.vocab.strings[attr]] = func(lex.orth_)
lex.set_attrs(**values)
lex.is_oov = False
for name in pipeline:
nlp.add_pipe(nlp.create_pipe(name), name=name)
nlp.add_pipe(nlp.create_pipe('merge_subtokens'))
if parser_multitasks:
for objective in parser_multitasks.split(','):
nlp.parser.add_multitask_objective(objective)
if entity_multitasks:
for objective in entity_multitasks.split(','):
nlp.entity.add_multitask_objective(objective)
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
if init_tok2vec is not None:
loaded = _load_pretrained_tok2vec(nlp, init_tok2vec)
print("Loaded pretrained tok2vec for:", loaded)
msg.text(Messages.M058.format(model=vectors))
_load_vectors(nlp, vectors)
# Multitask objectives
multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
for pipe_name, multitasks in multitask_options:
if multitasks:
if pipe_name not in pipeline:
msg.fail(Messages.M059.format(pipe=pipe_name))
pipe = nlp.get_pipe(pipe_name)
for objective in multitasks.split(","):
pipe.add_multitask_objective(objective)
# Prepare training corpus
msg.text(Messages.M060.format(limit=n_examples))
corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
n_train_words = corpus.count_train()
if base_model:
# Start with an existing model, use default optimizer
optimizer = create_default_optimizer(Model.ops)
else:
# Start with a blank model, call begin_training
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
nlp._optimizer = None
print("Itn. Dep Loss NER Loss UAS NER P. NER R. NER F. Tag % Token % CPU WPS GPU WPS")
# Load in pre-trained weights
if init_tok2vec is not None:
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
msg.text(Messages.M071.format(components=components))
print(
"\nItn. Dep Loss NER Loss UAS NER P. NER R. NER F. Tag % Token % CPU WPS GPU WPS"
)
try:
for i in range(n_iter):
train_docs = corpus.train_docs(nlp, noise_level=noise_level,
gold_preproc=gold_preproc, max_length=0)
train_docs = corpus.train_docs(
nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0
)
words_seen = 0
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
losses = {}
for batch in minibatch_by_words(train_docs, size=batch_sizes):
for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
if not batch:
continue
docs, golds = zip(*batch)
nlp.update(docs, golds, sgd=optimizer,
drop=next(dropout_rates), losses=losses)
nlp.update(
docs,
golds,
sgd=optimizer,
drop=next(dropout_rates),
losses=losses,
)
pbar.update(sum(len(doc) for doc in docs))
words_seen += sum(len(doc) for doc in docs)
with nlp.use_params(optimizer.averages):
util.set_env_log(False)
epoch_model_path = output_path / ('model%d' % i)
epoch_model_path = output_path / ("model%d" % i)
nlp.to_disk(epoch_model_path)
nlp_loaded = util.load_model_from_path(epoch_model_path)
dev_docs = list(corpus.dev_docs(
nlp_loaded,
gold_preproc=gold_preproc))
dev_docs = list(corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc))
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
start_time = timer()
scorer = nlp_loaded.evaluate(dev_docs, verbose)
scorer = nlp_loaded.evaluate(dev_docs, debug)
end_time = timer()
if use_gpu < 0:
gpu_wps = None
cpu_wps = nwords/(end_time-start_time)
cpu_wps = nwords / (end_time - start_time)
else:
gpu_wps = nwords/(end_time-start_time)
with Model.use_device('cpu'):
gpu_wps = nwords / (end_time - start_time)
with Model.use_device("cpu"):
nlp_loaded = util.load_model_from_path(epoch_model_path)
dev_docs = list(corpus.dev_docs(
nlp_loaded, gold_preproc=gold_preproc))
dev_docs = list(
corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
)
start_time = timer()
scorer = nlp_loaded.evaluate(dev_docs)
end_time = timer()
cpu_wps = nwords/(end_time-start_time)
acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
with acc_loc.open('w') as file_:
file_.write(json_dumps(scorer.scores))
meta_loc = output_path / ('model%d' % i) / 'meta.json'
meta['accuracy'] = scorer.scores
meta['speed'] = {'nwords': nwords, 'cpu': cpu_wps,
'gpu': gpu_wps}
meta['vectors'] = {'width': nlp.vocab.vectors_length,
'vectors': len(nlp.vocab.vectors),
'keys': nlp.vocab.vectors.n_keys}
meta['lang'] = nlp.lang
meta['pipeline'] = pipeline
meta['spacy_version'] = '>=%s' % about.__version__
meta.setdefault('name', 'model%d' % i)
meta.setdefault('version', version)
cpu_wps = nwords / (end_time - start_time)
acc_loc = output_path / ("model%d" % i) / "accuracy.json"
util.write_json(acc_loc, scorer.scores)
with meta_loc.open('w') as file_:
file_.write(json_dumps(meta))
util.set_env_log(True)
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps,
gpu_wps=gpu_wps)
# Update model meta.json
meta["lang"] = nlp.lang
meta["pipeline"] = nlp.pipe_names
meta["spacy_version"] = ">=%s" % about.__version__
meta["accuracy"] = scorer.scores
meta["speed"] = {"nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps}
meta["vectors"] = {
"width": nlp.vocab.vectors_length,
"vectors": len(nlp.vocab.vectors),
"keys": nlp.vocab.vectors.n_keys,
}
meta.setdefault("name", "model%d" % i)
meta.setdefault("version", version)
meta_loc = output_path / ("model%d" % i) / "meta.json"
util.write_json(meta_loc, meta)
util.set_env_log(verbose)
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps)
finally:
print("Saving model...")
with nlp.use_params(optimizer.averages):
final_model_path = output_path / 'model-final'
nlp.to_disk(final_model_path)
components = []
if not no_parser:
components.append('parser')
if not no_tagger:
components.append('tagger')
if not no_entities:
components.append('ner')
_collate_best_model(meta, output_path, components)
with msg.loading(Messages.M061):
with nlp.use_params(optimizer.averages):
final_model_path = output_path / "model-final"
nlp.to_disk(final_model_path)
msg.good(Messages.M066, util.path2str(final_model_path))
_collate_best_model(meta, output_path, nlp.pipe_names)
def _load_vectors(nlp, vectors):
util.load_model(vectors, vocab=nlp.vocab)
for lex in nlp.vocab:
values = {}
for attr, func in nlp.vocab.lex_attr_getters.items():
# These attrs are expected to be set by data. Others should
# be set by calling the language functions.
if attr not in (CLUSTER, PROB, IS_OOV, LANG):
values[lex.vocab.strings[attr]] = func(lex.orth_)
lex.set_attrs(**values)
lex.is_oov = False
def _load_pretrained_tok2vec(nlp, loc):
"""Load pre-trained weights for the 'token-to-vector' part of the component
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
"""
with loc.open('rb') as file_:
with loc.open("rb") as file_:
weights_data = file_.read()
loaded = []
for name, component in nlp.pipeline:
if hasattr(component, 'model') and hasattr(component.model, 'tok2vec'):
if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
component.tok2vec.from_bytes(weights_data)
loaded.append(name)
return loaded
@@ -222,24 +305,22 @@ def _collate_best_model(meta, output_path, components):
bests = {}
for component in components:
bests[component] = _find_best(output_path, component)
best_dest = output_path / 'model-best'
shutil.copytree(output_path / 'model-final', best_dest)
best_dest = output_path / "model-best"
shutil.copytree(output_path / "model-final", best_dest)
for component, best_component_src in bests.items():
shutil.rmtree(best_dest / component)
shutil.copytree(best_component_src / component, best_dest / component)
with (best_component_src / 'accuracy.json').open() as file_:
accs = json.load(file_)
accs = util.read_json(best_component_src / "accuracy.json")
for metric in _get_metrics(component):
meta['accuracy'][metric] = accs[metric]
with (best_dest / 'meta.json').open('w') as file_:
file_.write(json_dumps(meta))
meta["accuracy"][metric] = accs[metric]
util.write_json(best_dest / "meta.json", meta)
def _find_best(experiment_dir, component):
accuracies = []
for epoch_model in experiment_dir.iterdir():
if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
accs = json.load((epoch_model / "accuracy.json").open())
accs = util.read_json(epoch_model / "accuracy.json")
scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
accuracies.append((scores, epoch_model))
if accuracies:
@@ -247,6 +328,7 @@ def _find_best(experiment_dir, component):
else:
return None
def _get_metrics(component):
if component == "parser":
return ("las", "uas", "token_acc")
@@ -257,50 +339,40 @@ def _get_metrics(component):
return ("token_acc",)
def _render_parses(i, to_render):
to_render[0].user_data['title'] = "Batch %d" % i
with Path('/tmp/entities.html').open('w') as file_:
html = displacy.render(to_render[:5], style='ent', page=True)
file_.write(html)
with Path('/tmp/parses.html').open('w') as file_:
html = displacy.render(to_render[:5], style='dep', page=True)
file_.write(html)
def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
scores = {}
for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
'ents_p', 'ents_r', 'ents_f', 'cpu_wps', 'gpu_wps']:
for col in [
"dep_loss",
"tag_loss",
"uas",
"tags_acc",
"token_acc",
"ents_p",
"ents_r",
"ents_f",
"cpu_wps",
"gpu_wps",
]:
scores[col] = 0.0
scores['dep_loss'] = losses.get('parser', 0.0)
scores['ner_loss'] = losses.get('ner', 0.0)
scores['tag_loss'] = losses.get('tagger', 0.0)
scores["dep_loss"] = losses.get("parser", 0.0)
scores["ner_loss"] = losses.get("ner", 0.0)
scores["tag_loss"] = losses.get("tagger", 0.0)
scores.update(dev_scores)
scores['cpu_wps'] = cpu_wps
scores['gpu_wps'] = gpu_wps or 0.0
tpl = ''.join((
'{:<6d}',
'{dep_loss:<10.3f}',
'{ner_loss:<10.3f}',
'{uas:<8.3f}',
'{ents_p:<8.3f}',
'{ents_r:<8.3f}',
'{ents_f:<8.3f}',
'{tags_acc:<8.3f}',
'{token_acc:<9.3f}',
'{cpu_wps:<9.1f}',
'{gpu_wps:.1f}',
))
scores["cpu_wps"] = cpu_wps
scores["gpu_wps"] = gpu_wps or 0.0
tpl = "".join(
(
"{:<6d}",
"{dep_loss:<10.3f}",
"{ner_loss:<10.3f}",
"{uas:<8.3f}",
"{ents_p:<8.3f}",
"{ents_r:<8.3f}",
"{ents_f:<8.3f}",
"{tags_acc:<8.3f}",
"{token_acc:<9.3f}",
"{cpu_wps:<9.1f}",
"{gpu_wps:.1f}",
)
)
print(tpl.format(itn, **scores))
def print_results(scorer):
results = {
'TOK': '%.2f' % scorer.token_acc,
'POS': '%.2f' % scorer.tags_acc,
'UAS': '%.2f' % scorer.uas,
'LAS': '%.2f' % scorer.las,
'NER P': '%.2f' % scorer.ents_p,
'NER R': '%.2f' % scorer.ents_r,
'NER F': '%.2f' % scorer.ents_f}
util.print_table(results, title="Results")

spacy/cli/ud/__init__.py (new file)

@@ -0,0 +1,2 @@
from .conll17_ud_eval import main as ud_evaluate # noqa: F401
from .ud_train import main as ud_train # noqa: F401


@@ -1,4 +1,5 @@
#!/usr/bin/env python
# flake8: noqa
# CoNLL 2017 UD Parsing evaluation script.
#
@@ -214,7 +215,7 @@ def load_conllu(file):
start, end = map(int, columns[ID].split("-"))
except:
raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID]))
for _ in range(start, end + 1):
word_line = file.readline().rstrip("\r\n")
word_columns = word_line.split("\t")


@@ -1,7 +1,9 @@
'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
# flake8: noqa
"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
.conllu format for development data, allowing the official scorer to be used.
'''
"""
from __future__ import unicode_literals
import plac
import tqdm
from pathlib import Path
@@ -11,15 +13,17 @@ import json
import spacy
import spacy.util
from ..tokens import Token, Doc
from ..gold import GoldParse
from ..util import compounding, minibatch_by_words
from ..syntax.nonproj import projectivize
from ..matcher import Matcher
#from ..morphology import Fused_begin, Fused_inside
from .. import displacy
from ...tokens import Token, Doc
from ...gold import GoldParse
from ...util import compounding, minibatch_by_words
from ...syntax.nonproj import projectivize
from ...matcher import Matcher
# from ...morphology import Fused_begin, Fused_inside
from ... import displacy
from collections import defaultdict, Counter
from timeit import default_timer as timer
Fused_begin = None
Fused_inside = None
@@ -30,43 +34,45 @@ import cytoolz
from . import conll17_ud_eval
from .. import lang
from .. import lang
from ..lang import zh
from ..lang import ja
from ..lang import ru
from ... import lang
from ...lang import zh
from ...lang import ja
from ...lang import ru
################
# Data reading #
################
space_re = re.compile('\s+')
space_re = re.compile("\s+")
def split_text(text):
return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')]
return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")]
##############
# Evaluation #
##############
def read_conllu(file_):
docs = []
sent = []
doc = []
for line in file_:
if line.startswith('# newdoc'):
if line.startswith("# newdoc"):
if doc:
docs.append(doc)
doc = []
elif line.startswith('#'):
elif line.startswith("#"):
continue
elif not line.strip():
if sent:
doc.append(sent)
sent = []
else:
sent.append(list(line.strip().split('\t')))
sent.append(list(line.strip().split("\t")))
if len(sent[-1]) != 10:
print(repr(line))
raise ValueError
@@ -78,7 +84,7 @@ def read_conllu(file_):
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
if text_loc.parts[-1].endswith('.conllu'):
if text_loc.parts[-1].endswith(".conllu"):
docs = []
with text_loc.open() as file_:
for conllu_doc in read_conllu(file_):
@@ -88,14 +94,14 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
for name, component in nlp.pipeline:
docs = list(component.pipe(docs))
else:
with text_loc.open('r', encoding='utf8') as text_file:
with text_loc.open("r", encoding="utf8") as text_file:
texts = split_text(text_file.read())
docs = list(nlp.pipe(texts))
with sys_loc.open('w', encoding='utf8') as out_file:
with sys_loc.open("w", encoding="utf8") as out_file:
write_conllu(docs, out_file)
with gold_loc.open('r', encoding='utf8') as gold_file:
with gold_loc.open("r", encoding="utf8") as gold_file:
gold_ud = conll17_ud_eval.load_conllu(gold_file)
with sys_loc.open('r', encoding='utf8') as sys_file:
with sys_loc.open("r", encoding="utf8") as sys_file:
sys_ud = conll17_ud_eval.load_conllu(sys_file)
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
return docs, scores
@@ -103,26 +109,26 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
def write_conllu(docs, file_):
merger = Matcher(docs[0].vocab)
merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}])
merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
for i, doc in enumerate(docs):
matches = merger(doc)
spans = [doc[start:end+1] for _, start, end in matches]
spans = [doc[start : end + 1] for _, start, end in matches]
offsets = [(span.start_char, span.end_char) for span in spans]
for start_char, end_char in offsets:
doc.merge(start_char, end_char)
# TODO: This shouldn't be necessary? Should be handled in merge
for word in doc:
if word.i == word.head.i:
word.dep_ = 'ROOT'
word.dep_ = "ROOT"
file_.write("# newdoc id = {i}\n".format(i=i))
for j, sent in enumerate(doc.sents):
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
file_.write("# text = {text}\n".format(text=sent.text))
for k, token in enumerate(sent):
file_.write(_get_token_conllu(token, k, len(sent)) + '\n')
file_.write('\n')
file_.write(_get_token_conllu(token, k, len(sent)) + "\n")
file_.write("\n")
for word in sent:
if word.head.i == word.i and word.dep_ == 'ROOT':
if word.head.i == word.i and word.dep_ == "ROOT":
break
else:
print("Rootless sentence!")
@@ -134,24 +140,34 @@ def write_conllu(docs, file_):
def _get_token_conllu(token, k, sent_len):
if token.check_morph(Fused_begin) and (k+1 < sent_len):
if token.check_morph(Fused_begin) and (k + 1 < sent_len):
n = 1
text = [token.text]
while token.nbor(n).check_morph(Fused_inside):
text.append(token.nbor(n).text)
n += 1
id_ = '%d-%d' % (k+1, (k+n))
fields = [id_, ''.join(text)] + ['_'] * 8
lines = ['\t'.join(fields)]
id_ = "%d-%d" % (k + 1, (k + n))
fields = [id_, "".join(text)] + ["_"] * 8
lines = ["\t".join(fields)]
else:
lines = []
if token.head.i == token.i:
head = 0
else:
head = k + (token.head.i - token.i) + 1
fields = [str(k+1), token.text, token.lemma_, token.pos_, token.tag_, '_',
str(head), token.dep_.lower(), '_', '_']
if token.check_morph(Fused_begin) and (k+1 < sent_len):
fields = [
str(k + 1),
token.text,
token.lemma_,
token.pos_,
token.tag_,
"_",
str(head),
token.dep_.lower(),
"_",
"_",
]
if token.check_morph(Fused_begin) and (k + 1 < sent_len):
if k == 0:
fields[1] = token.norm_[0].upper() + token.norm_[1:]
else:
@ -163,18 +179,18 @@ def _get_token_conllu(token, k, sent_len):
split_end = token._.split_end
split_len = (split_end.i - split_start.i) + 1
n_in_split = token.i - split_start.i
subtokens = guess_fused_orths(split_start.text, [''] * split_len)
subtokens = guess_fused_orths(split_start.text, [""] * split_len)
fields[1] = subtokens[n_in_split]
lines.append('\t'.join(fields))
return '\n'.join(lines)
lines.append("\t".join(fields))
return "\n".join(lines)
def guess_fused_orths(word, ud_forms):
'''The UD data 'fused tokens' don't necessarily expand to keys that match
"""The UD data 'fused tokens' don't necessarily expand to keys that match
the form. We need orths that exactly match the string. Here we make a best
effort to divide up the word.'''
if word == ''.join(ud_forms):
effort to divide up the word."""
if word == "".join(ud_forms):
# Happy case: we get a perfect split, with each letter accounted for.
return ud_forms
elif len(word) == sum(len(subtoken) for subtoken in ud_forms):
@ -183,16 +199,16 @@ def guess_fused_orths(word, ud_forms):
remain = word
for subtoken in ud_forms:
assert len(subtoken) >= 1
output.append(remain[:len(subtoken)])
remain = remain[len(subtoken):]
output.append(remain[: len(subtoken)])
remain = remain[len(subtoken) :]
assert len(remain) == 0, (word, ud_forms, remain)
return output
else:
# Let's say word is 6 long, and there are three subtokens. The orths
# *must* equal the original string. Arbitrarily, split [4, 1, 1]
first = word[:len(word)-(len(ud_forms)-1)]
first = word[: len(word) - (len(ud_forms) - 1)]
output = [first]
remain = word[len(first):]
remain = word[len(first) :]
for i in range(1, len(ud_forms)):
assert remain
output.append(remain[:1])
@ -201,60 +217,50 @@ def guess_fused_orths(word, ud_forms):
return output
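To make the three branches above concrete, here is a small, hypothetical walk-through (the words are invented; the results follow directly from the code):
# Perfect split: the UD forms already concatenate to the surface word.
assert guess_fused_orths("ab", ["a", "b"]) == ["a", "b"]
# Lengths add up but characters differ: slice the surface word instead,
# so the orths exactly match the original string.
assert guess_fused_orths("della", ["di", "lla"]) == ["de", "lla"]
# Fallback: give the first subtoken everything except one character per
# remaining subtoken, e.g. a 2-char word over 2 forms splits as [1, 1].
assert guess_fused_orths("du", ["de", "le"]) == ["d", "u"]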
def print_results(name, ud_scores):
fields = {}
if ud_scores is not None:
fields.update({
'words': ud_scores['Words'].f1 * 100,
'sents': ud_scores['Sentences'].f1 * 100,
'tags': ud_scores['XPOS'].f1 * 100,
'uas': ud_scores['UAS'].f1 * 100,
'las': ud_scores['LAS'].f1 * 100,
})
fields.update(
{
"words": ud_scores["Words"].f1 * 100,
"sents": ud_scores["Sentences"].f1 * 100,
"tags": ud_scores["XPOS"].f1 * 100,
"uas": ud_scores["UAS"].f1 * 100,
"las": ud_scores["LAS"].f1 * 100,
}
)
else:
fields.update({
'words': 0.0,
'sents': 0.0,
'tags': 0.0,
'uas': 0.0,
'las': 0.0
})
tpl = '\t'.join((
name,
'{las:.1f}',
'{uas:.1f}',
'{tags:.1f}',
'{sents:.1f}',
'{words:.1f}',
))
fields.update({"words": 0.0, "sents": 0.0, "tags": 0.0, "uas": 0.0, "las": 0.0})
tpl = "\t".join(
(name, "{las:.1f}", "{uas:.1f}", "{tags:.1f}", "{sents:.1f}", "{words:.1f}")
)
print(tpl.format(**fields))
return fields
def get_token_split_start(token):
if token.text == '':
if token.text == "":
assert token.i != 0
i = -1
while token.nbor(i).text == '':
while token.nbor(i).text == "":
i -= 1
return token.nbor(i)
elif (token.i+1) < len(token.doc) and token.nbor(1).text == '':
elif (token.i + 1) < len(token.doc) and token.nbor(1).text == "":
return token
else:
return None
def get_token_split_end(token):
if (token.i+1) == len(token.doc):
return token if token.text == '' else None
elif token.text != '' and token.nbor(1).text != '':
if (token.i + 1) == len(token.doc):
return token if token.text == "" else None
elif token.text != "" and token.nbor(1).text != "":
return None
i = 1
while (token.i+i) < len(token.doc) and token.nbor(i).text == '':
while (token.i + i) < len(token.doc) and token.nbor(i).text == "":
i += 1
return token.nbor(i-1)
return token.nbor(i - 1)
##################
# Initialization #
@ -262,54 +268,73 @@ def get_token_split_end(token):
def load_nlp(experiments_dir, corpus):
nlp = spacy.load(experiments_dir / corpus / 'best-model')
nlp = spacy.load(experiments_dir / corpus / "best-model")
return nlp
def initialize_pipeline(nlp, docs, golds, config, device):
nlp.add_pipe(nlp.create_pipe('parser'))
nlp.add_pipe(nlp.create_pipe("parser"))
return nlp
@plac.annotations(
test_data_dir=("Path to Universal Dependencies test data", "positional", None, Path),
test_data_dir=(
"Path to Universal Dependencies test data",
"positional",
None,
Path,
),
experiment_dir=("Parent directory with output model", "positional", None, Path),
corpus=("UD corpus to evaluate, e.g. UD_English, UD_Spanish, etc", "positional", None, str),
corpus=(
"UD corpus to evaluate, e.g. UD_English, UD_Spanish, etc",
"positional",
None,
str,
),
)
def main(test_data_dir, experiment_dir, corpus):
Token.set_extension('split_start', getter=get_token_split_start)
Token.set_extension('split_end', getter=get_token_split_end)
Token.set_extension('begins_fused', default=False)
Token.set_extension('inside_fused', default=False)
Token.set_extension("split_start", getter=get_token_split_start)
Token.set_extension("split_end", getter=get_token_split_end)
Token.set_extension("begins_fused", default=False)
Token.set_extension("inside_fused", default=False)
lang.zh.Chinese.Defaults.use_jieba = False
lang.ja.Japanese.Defaults.use_janome = False
lang.ru.Russian.Defaults.use_pymorphy2 = False
nlp = load_nlp(experiment_dir, corpus)
treebank_code = nlp.meta['treebank']
for section in ('test', 'dev'):
if section == 'dev':
section_dir = 'conll17-ud-development-2017-03-19'
else:
section_dir = 'conll17-ud-test-2017-05-09'
text_path = test_data_dir / 'input' / section_dir / (treebank_code+'.txt')
udpipe_path = test_data_dir / 'input' / section_dir / (treebank_code+'-udpipe.conllu')
gold_path = test_data_dir / 'gold' / section_dir / (treebank_code+'.conllu')
header = [section, 'LAS', 'UAS', 'TAG', 'SENT', 'WORD']
print('\t'.join(header))
inputs = {'gold': gold_path, 'udp': udpipe_path, 'raw': text_path}
for input_type in ('udp', 'raw'):
treebank_code = nlp.meta["treebank"]
for section in ("test", "dev"):
if section == "dev":
section_dir = "conll17-ud-development-2017-03-19"
else:
section_dir = "conll17-ud-test-2017-05-09"
text_path = test_data_dir / "input" / section_dir / (treebank_code + ".txt")
udpipe_path = (
test_data_dir / "input" / section_dir / (treebank_code + "-udpipe.conllu")
)
gold_path = test_data_dir / "gold" / section_dir / (treebank_code + ".conllu")
header = [section, "LAS", "UAS", "TAG", "SENT", "WORD"]
print("\t".join(header))
inputs = {"gold": gold_path, "udp": udpipe_path, "raw": text_path}
for input_type in ("udp", "raw"):
input_path = inputs[input_type]
output_path = experiment_dir / corpus / '{section}.conllu'.format(section=section)
output_path = (
experiment_dir / corpus / "{section}.conllu".format(section=section)
)
parsed_docs, test_scores = evaluate(nlp, input_path, gold_path, output_path)
accuracy = print_results(input_type, test_scores)
acc_path = experiment_dir / corpus / '{section}-accuracy.json'.format(section=section)
with open(acc_path, 'w') as file_:
acc_path = (
experiment_dir
/ corpus
/ "{section}-accuracy.json".format(section=section)
)
with open(acc_path, "w") as file_:
file_.write(json.dumps(accuracy, indent=2))
if __name__ == '__main__':
if __name__ == "__main__":
plac.call(main)

View File

@ -1,7 +1,9 @@
'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
# flake8: noqa
"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
.conllu format for development data, allowing the official scorer to be used.
'''
"""
from __future__ import unicode_literals
import plac
import tqdm
from pathlib import Path
@ -11,12 +13,12 @@ import json
import spacy
import spacy.util
from ..tokens import Token, Doc
from ..gold import GoldParse
from ..util import compounding, minibatch, minibatch_by_words
from ..syntax.nonproj import projectivize
from ..matcher import Matcher
from .. import displacy
from ...tokens import Token, Doc
from ...gold import GoldParse
from ...util import compounding, minibatch, minibatch_by_words
from ...syntax.nonproj import projectivize
from ...matcher import Matcher
from ... import displacy
from collections import defaultdict, Counter
from timeit import default_timer as timer
@ -27,10 +29,9 @@ import cytoolz
from . import conll17_ud_eval
from .. import lang
from .. import lang
from ..lang import zh
from ..lang import ja
from ... import lang
from ...lang import zh
from ...lang import ja
try:
import torch
@ -42,17 +43,26 @@ except ImportError:
# Data reading #
################
space_re = re.compile('\s+')
def split_text(text):
return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')]
space_re = re.compile("\s+")
def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
max_doc_length=None, limit=None):
'''Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
def split_text(text):
return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")]
def read_data(
nlp,
conllu_file,
text_file,
raw_text=True,
oracle_segments=False,
max_doc_length=None,
limit=None,
):
"""Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
include Doc objects created using nlp.make_doc and then aligned against
the gold-standard sequences. If oracle_segments=True, include Doc objects
created from the gold-standard segments. At least one must be True.'''
created from the gold-standard segments. At least one must be True."""
if not raw_text and not oracle_segments:
raise ValueError("At least one of raw_text or oracle_segments must be True")
paragraphs = split_text(text_file.read())
@ -66,22 +76,21 @@ def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
for cs in cd:
sent = defaultdict(list)
for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
if '.' in id_:
if "." in id_:
continue
if '-' in id_:
if "-" in id_:
continue
id_ = int(id_)-1
head = int(head)-1 if head != '0' else id_
sent['words'].append(word)
sent['tags'].append(tag)
sent['heads'].append(head)
sent['deps'].append('ROOT' if dep == 'root' else dep)
sent['spaces'].append(space_after == '_')
sent['entities'] = ['-'] * len(sent['words'])
sent['heads'], sent['deps'] = projectivize(sent['heads'],
sent['deps'])
id_ = int(id_) - 1
head = int(head) - 1 if head != "0" else id_
sent["words"].append(word)
sent["tags"].append(tag)
sent["heads"].append(head)
sent["deps"].append("ROOT" if dep == "root" else dep)
sent["spaces"].append(space_after == "_")
sent["entities"] = ["-"] * len(sent["words"])
sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
if oracle_segments:
docs.append(Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces']))
docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
golds.append(GoldParse(docs[-1], **sent))
sent_annots.append(sent)
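A hedged usage sketch of read_data() as defined above, assuming it is called from within this script; the file paths and the blank pipeline are placeholders:
# Illustrative only: the paths stand in for a real UD treebank.
from pathlib import Path
import spacy

nlp = spacy.blank("en")
with Path("train.conllu").open(encoding="utf8") as conllu_file, \
        Path("train.txt").open(encoding="utf8") as text_file:
    docs, golds = read_data(
        nlp,
        conllu_file,
        text_file,
        raw_text=True,           # align Docs made with nlp.make_doc against the gold sentences
        oracle_segments=False,   # set True to also include Docs built from the gold segments
        max_doc_length=10,
        limit=5,
    )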
@ -107,18 +116,18 @@ def read_conllu(file_):
sent = []
doc = []
for line in file_:
if line.startswith('# newdoc'):
if line.startswith("# newdoc"):
if doc:
docs.append(doc)
doc = []
elif line.startswith('#'):
elif line.startswith("#"):
continue
elif not line.strip():
if sent:
doc.append(sent)
sent = []
else:
sent.append(list(line.strip().split('\t')))
sent.append(list(line.strip().split("\t")))
if len(sent[-1]) != 10:
print(repr(line))
raise ValueError
@ -134,17 +143,19 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
flat = defaultdict(list)
sent_starts = []
for sent in sent_annots:
flat['heads'].extend(len(flat['words'])+head for head in sent['heads'])
for field in ['words', 'tags', 'deps', 'entities', 'spaces']:
flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"])
for field in ["words", "tags", "deps", "entities", "spaces"]:
flat[field].extend(sent[field])
sent_starts.append(True)
sent_starts.extend([False] * (len(sent['words'])-1))
sent_starts.extend([False] * (len(sent["words"]) - 1))
# Construct text if necessary
assert len(flat['words']) == len(flat['spaces'])
assert len(flat["words"]) == len(flat["spaces"])
if text is None:
text = ''.join(word+' '*space for word, space in zip(flat['words'], flat['spaces']))
text = "".join(
word + " " * space for word, space in zip(flat["words"], flat["spaces"])
)
doc = nlp.make_doc(text)
flat.pop('spaces')
flat.pop("spaces")
gold = GoldParse(doc, **flat)
gold.sent_starts = sent_starts
for i in range(len(gold.heads)):
@ -154,13 +165,15 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
return doc, gold
#############################
# Data transforms for spaCy #
#############################
def golds_to_gold_tuples(docs, golds):
'''Get out the annoying 'tuples' format used by begin_training, given the
GoldParse objects.'''
"""Get out the annoying 'tuples' format used by begin_training, given the
GoldParse objects."""
tuples = []
for doc, gold in zip(docs, golds):
text = doc.text
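As a brief, hedged sketch of how this helper is consumed, initialize_pipeline() further down passes it to begin_training() as a callable, roughly like this:
# Sketch only: nlp, docs and golds would come from load_nlp() and read_data() above.
# The result is one (text, sentences) entry per Doc, where each sentence carries
# the (ids, words, tags, heads, deps, ner) columns plus a brackets list.
optimizer = nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))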
@ -174,8 +187,9 @@ def golds_to_gold_tuples(docs, golds):
# Evaluation #
##############
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
if text_loc.parts[-1].endswith('.conllu'):
if text_loc.parts[-1].endswith(".conllu"):
docs = []
with text_loc.open() as file_:
for conllu_doc in read_conllu(file_):
@ -185,14 +199,14 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
for name, component in nlp.pipeline:
docs = list(component.pipe(docs))
else:
with text_loc.open('r', encoding='utf8') as text_file:
with text_loc.open("r", encoding="utf8") as text_file:
texts = split_text(text_file.read())
docs = list(nlp.pipe(texts))
with sys_loc.open('w', encoding='utf8') as out_file:
with sys_loc.open("w", encoding="utf8") as out_file:
write_conllu(docs, out_file)
with gold_loc.open('r', encoding='utf8') as gold_file:
with gold_loc.open("r", encoding="utf8") as gold_file:
gold_ud = conll17_ud_eval.load_conllu(gold_file)
with sys_loc.open('r', encoding='utf8') as sys_file:
with sys_loc.open("r", encoding="utf8") as sys_file:
sys_ud = conll17_ud_eval.load_conllu(sys_file)
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
return docs, scores
@ -200,10 +214,10 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
def write_conllu(docs, file_):
merger = Matcher(docs[0].vocab)
merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}])
merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
for i, doc in enumerate(docs):
matches = merger(doc)
spans = [doc[start:end+1] for _, start, end in matches]
spans = [doc[start : end + 1] for _, start, end in matches]
offsets = [(span.start_char, span.end_char) for span in spans]
for start_char, end_char in offsets:
doc.merge(start_char, end_char)
@ -213,65 +227,82 @@ def write_conllu(docs, file_):
file_.write("# text = {text}\n".format(text=sent.text))
for k, token in enumerate(sent):
if token.head.i > sent[-1].i or token.head.i < sent[0].i:
for word in doc[sent[0].i-10 : sent[0].i]:
for word in doc[sent[0].i - 10 : sent[0].i]:
print(word.i, word.head.i, word.text, word.dep_)
for word in sent:
print(word.i, word.head.i, word.text, word.dep_)
for word in doc[sent[-1].i : sent[-1].i+10]:
for word in doc[sent[-1].i : sent[-1].i + 10]:
print(word.i, word.head.i, word.text, word.dep_)
raise ValueError("Invalid parse: head outside sentence (%s)" % token.text)
file_.write(token._.get_conllu_lines(k) + '\n')
file_.write('\n')
raise ValueError(
"Invalid parse: head outside sentence (%s)" % token.text
)
file_.write(token._.get_conllu_lines(k) + "\n")
file_.write("\n")
def print_progress(itn, losses, ud_scores):
fields = {
'dep_loss': losses.get('parser', 0.0),
'tag_loss': losses.get('tagger', 0.0),
'words': ud_scores['Words'].f1 * 100,
'sents': ud_scores['Sentences'].f1 * 100,
'tags': ud_scores['XPOS'].f1 * 100,
'uas': ud_scores['UAS'].f1 * 100,
'las': ud_scores['LAS'].f1 * 100,
"dep_loss": losses.get("parser", 0.0),
"tag_loss": losses.get("tagger", 0.0),
"words": ud_scores["Words"].f1 * 100,
"sents": ud_scores["Sentences"].f1 * 100,
"tags": ud_scores["XPOS"].f1 * 100,
"uas": ud_scores["UAS"].f1 * 100,
"las": ud_scores["LAS"].f1 * 100,
}
header = ['Epoch', 'Loss', 'LAS', 'UAS', 'TAG', 'SENT', 'WORD']
header = ["Epoch", "Loss", "LAS", "UAS", "TAG", "SENT", "WORD"]
if itn == 0:
print('\t'.join(header))
tpl = '\t'.join((
'{:d}',
'{dep_loss:.1f}',
'{las:.1f}',
'{uas:.1f}',
'{tags:.1f}',
'{sents:.1f}',
'{words:.1f}',
))
print("\t".join(header))
tpl = "\t".join(
(
"{:d}",
"{dep_loss:.1f}",
"{las:.1f}",
"{uas:.1f}",
"{tags:.1f}",
"{sents:.1f}",
"{words:.1f}",
)
)
print(tpl.format(itn, **fields))
#def get_sent_conllu(sent, sent_id):
# def get_sent_conllu(sent, sent_id):
# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
def get_token_conllu(token, i):
if token._.begins_fused:
n = 1
while token.nbor(n)._.inside_fused:
n += 1
id_ = '%d-%d' % (i, i+n)
lines = [id_, token.text, '_', '_', '_', '_', '_', '_', '_', '_']
id_ = "%d-%d" % (i, i + n)
lines = [id_, token.text, "_", "_", "_", "_", "_", "_", "_", "_"]
else:
lines = []
if token.head.i == token.i:
head = 0
else:
head = i + (token.head.i - token.i) + 1
fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, '_',
str(head), token.dep_.lower(), '_', '_']
lines.append('\t'.join(fields))
return '\n'.join(lines)
fields = [
str(i + 1),
token.text,
token.lemma_,
token.pos_,
token.tag_,
"_",
str(head),
token.dep_.lower(),
"_",
"_",
]
lines.append("\t".join(fields))
return "\n".join(lines)
Token.set_extension('get_conllu_lines', method=get_token_conllu)
Token.set_extension('begins_fused', default=False)
Token.set_extension('inside_fused', default=False)
Token.set_extension("get_conllu_lines", method=get_token_conllu)
Token.set_extension("begins_fused", default=False)
Token.set_extension("inside_fused", default=False)
##################
@ -280,35 +311,40 @@ Token.set_extension('inside_fused', default=False)
def load_nlp(corpus, config, vectors=None):
lang = corpus.split('_')[0]
lang = corpus.split("_")[0]
nlp = spacy.blank(lang)
if config.vectors:
if not vectors:
raise ValueError("config asks for vectors, but no vectors "
"directory set on command line (use -v)")
if not vectors:
raise ValueError(
"config asks for vectors, but no vectors "
"directory set on command line (use -v)"
)
if (Path(vectors) / corpus).exists():
nlp.vocab.from_disk(Path(vectors) / corpus / 'vocab')
nlp.meta['treebank'] = corpus
nlp.vocab.from_disk(Path(vectors) / corpus / "vocab")
nlp.meta["treebank"] = corpus
return nlp
def initialize_pipeline(nlp, docs, golds, config, device):
nlp.add_pipe(nlp.create_pipe('tagger'))
nlp.add_pipe(nlp.create_pipe('parser'))
nlp.add_pipe(nlp.create_pipe("tagger"))
nlp.add_pipe(nlp.create_pipe("parser"))
if config.multitask_tag:
nlp.parser.add_multitask_objective('tag')
nlp.parser.add_multitask_objective("tag")
if config.multitask_sent:
nlp.parser.add_multitask_objective('sent_start')
nlp.parser.add_multitask_objective("sent_start")
for gold in golds:
for tag in gold.tags:
if tag is not None:
nlp.tagger.add_label(tag)
if torch is not None and device != -1:
torch.set_default_tensor_type('torch.cuda.FloatTensor')
torch.set_default_tensor_type("torch.cuda.FloatTensor")
optimizer = nlp.begin_training(
lambda: golds_to_gold_tuples(docs, golds), device=device,
subword_features=config.subword_features, conv_depth=config.conv_depth,
bilstm_depth=config.bilstm_depth)
lambda: golds_to_gold_tuples(docs, golds),
device=device,
subword_features=config.subword_features,
conv_depth=config.conv_depth,
bilstm_depth=config.bilstm_depth,
)
if config.pretrained_tok2vec:
_load_pretrained_tok2vec(nlp, config.pretrained_tok2vec)
return optimizer
@ -318,27 +354,41 @@ def _load_pretrained_tok2vec(nlp, loc):
"""Load pre-trained weights for the 'token-to-vector' part of the component
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
"""
with Path(loc).open('rb') as file_:
with Path(loc).open("rb") as file_:
weights_data = file_.read()
loaded = []
for name, component in nlp.pipeline:
if hasattr(component, 'model') and hasattr(component.model, 'tok2vec'):
if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
component.tok2vec.from_bytes(weights_data)
loaded.append(name)
return loaded
########################
# Command line helpers #
########################
class Config(object):
def __init__(self, vectors=None, max_doc_length=10, multitask_tag=False,
multitask_sent=False, multitask_dep=False, multitask_vectors=None,
bilstm_depth=0, nr_epoch=30, min_batch_size=750, max_batch_size=750,
batch_by_words=True, dropout=0.1, conv_depth=4, subword_features=True,
vectors_dir=None, pretrained_tok2vec=None):
def __init__(
self,
vectors=None,
max_doc_length=10,
multitask_tag=False,
multitask_sent=False,
multitask_dep=False,
multitask_vectors=None,
bilstm_depth=0,
nr_epoch=30,
min_batch_size=100,
max_batch_size=1000,
batch_by_words=True,
dropout=0.2,
conv_depth=4,
subword_features=True,
vectors_dir=None,
pretrained_tok2vec=None,
):
if vectors_dir is not None:
if vectors is None:
vectors = True
@ -346,13 +396,13 @@ class Config(object):
multitask_vectors = True
for key, value in locals().items():
setattr(self, key, value)
@classmethod
def load(cls, loc, vectors_dir=None):
with Path(loc).open('r', encoding='utf8') as file_:
with Path(loc).open("r", encoding="utf8") as file_:
cfg = json.load(file_)
if vectors_dir is not None:
cfg['vectors_dir'] = vectors_dir
cfg["vectors_dir"] = vectors_dir
return cls(**cfg)
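A minimal sketch of a JSON config file that Config.load() above should accept, assuming the keys mirror the __init__ arguments (the values here are arbitrary):
# Illustrative only: write a config and load it back.
import json

cfg = {
    "max_doc_length": 10,
    "multitask_tag": False,
    "multitask_sent": False,
    "nr_epoch": 30,
    "min_batch_size": 100,
    "max_batch_size": 1000,
    "dropout": 0.2,
    "conv_depth": 4,
    "subword_features": True,
}
with open("config.json", "w") as f:
    json.dump(cfg, f, indent=2)

config = Config.load("config.json")  # roughly equivalent to Config(**cfg)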
@ -364,43 +414,59 @@ class Dataset(object):
self.text = None
for file_path in self.path.iterdir():
name = file_path.parts[-1]
if section in name and name.endswith('conllu'):
if section in name and name.endswith("conllu"):
self.conllu = file_path
elif section in name and name.endswith('txt'):
elif section in name and name.endswith("txt"):
self.text = file_path
if self.conllu is None:
msg = "Could not find .txt file in {path} for {section}"
raise IOError(msg.format(section=section, path=path))
if self.text is None:
msg = "Could not find .txt file in {path} for {section}"
self.lang = self.conllu.parts[-1].split('-')[0].split('_')[0]
self.lang = self.conllu.parts[-1].split("-")[0].split("_")[0]
class TreebankPaths(object):
def __init__(self, ud_path, treebank, **cfg):
self.train = Dataset(ud_path / treebank, 'train')
self.dev = Dataset(ud_path / treebank, 'dev')
self.train = Dataset(ud_path / treebank, "train")
self.dev = Dataset(ud_path / treebank, "dev")
self.lang = self.train.lang
@plac.annotations(
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
corpus=("UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
"positional", None, str),
corpus=(
"UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
"positional",
None,
str,
),
parses_dir=("Directory to write the development parses", "positional", None, Path),
config=("Path to json formatted config file", "option", "C", Path),
limit=("Size limit", "option", "n", int),
gpu_device=("Use GPU", "option", "g", int),
use_oracle_segments=("Use oracle segments", "flag", "G", int),
vectors_dir=("Path to directory with pre-trained vectors, named e.g. en/",
"option", "v", Path),
vectors_dir=(
"Path to directory with pre-trained vectors, named e.g. en/",
"option",
"v",
Path,
),
)
def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vectors_dir=None,
use_oracle_segments=False):
def main(
ud_dir,
parses_dir,
corpus,
config=None,
limit=0,
gpu_device=-1,
vectors_dir=None,
use_oracle_segments=False,
):
spacy.util.fix_random_seed()
lang.zh.Chinese.Defaults.use_jieba = False
lang.ja.Japanese.Defaults.use_janome = False
if config is not None:
config = Config.load(config, vectors_dir=vectors_dir)
else:
@ -411,19 +477,28 @@ def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vector
print("Train and evaluate", corpus, "using lang", paths.lang)
nlp = load_nlp(paths.lang, config, vectors=vectors_dir)
docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
max_doc_length=config.max_doc_length,
limit=limit)
docs, golds = read_data(
nlp,
paths.train.conllu.open(),
paths.train.text.open(),
max_doc_length=config.max_doc_length,
limit=limit,
)
optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device)
batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001)
beam_prob = compounding(0.2, 0.8, 1.001)
for i in range(config.nr_epoch):
docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
max_doc_length=config.max_doc_length, limit=limit,
oracle_segments=use_oracle_segments,
raw_text=not use_oracle_segments)
docs, golds = read_data(
nlp,
paths.train.conllu.open(),
paths.train.text.open(),
max_doc_length=config.max_doc_length,
limit=limit,
oracle_segments=use_oracle_segments,
raw_text=not use_oracle_segments,
)
Xs = list(zip(docs, golds))
random.shuffle(Xs)
if config.batch_by_words:
@ -436,27 +511,34 @@ def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vector
for batch in batches:
batch_docs, batch_gold = zip(*batch)
pbar.update(sum(len(doc) for doc in batch_docs))
nlp.parser.cfg['beam_update_prob'] = next(beam_prob)
nlp.update(batch_docs, batch_gold, sgd=optimizer,
drop=config.dropout, losses=losses)
out_path = parses_dir / corpus / 'epoch-{i}.conllu'.format(i=i)
nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
nlp.update(
batch_docs,
batch_gold,
sgd=optimizer,
drop=config.dropout,
losses=losses,
)
out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
with nlp.use_params(optimizer.averages):
if use_oracle_segments:
parsed_docs, scores = evaluate(nlp, paths.dev.conllu,
paths.dev.conllu, out_path)
parsed_docs, scores = evaluate(
nlp, paths.dev.conllu, paths.dev.conllu, out_path
)
else:
parsed_docs, scores = evaluate(nlp, paths.dev.text,
paths.dev.conllu, out_path)
parsed_docs, scores = evaluate(
nlp, paths.dev.text, paths.dev.conllu, out_path
)
print_progress(i, losses, scores)
def _render_parses(i, to_render):
to_render[0].user_data['title'] = "Batch %d" % i
with Path('/tmp/parses.html').open('w') as file_:
html = displacy.render(to_render[:5], style='dep', page=True)
to_render[0].user_data["title"] = "Batch %d" % i
with Path("/tmp/parses.html").open("w") as file_:
html = displacy.render(to_render[:5], style="dep", page=True)
file_.write(html)
if __name__ == '__main__':
if __name__ == "__main__":
plac.call(main)

View File

@ -4,28 +4,34 @@ from __future__ import unicode_literals, print_function
import pkg_resources
from pathlib import Path
import sys
import ujson
import requests
from wasabi import Printer
from ._messages import Messages
from ..compat import path2str, locale_escape
from ..util import prints, get_data_path, read_json
from ..compat import path2str
from ..util import get_data_path, read_json
from .. import about
def validate():
"""Validate that the currently installed version of spaCy is compatible
"""
Validate that the currently installed version of spaCy is compatible
with the installed models. Should be run after `pip install -U spacy`.
"""
r = requests.get(about.__compatibility__)
if r.status_code != 200:
prints(Messages.M021, title=Messages.M003.format(code=r.status_code),
exits=1)
compat = r.json()['spacy']
msg = Printer()
with msg.loading("Loading compatibility table..."):
r = requests.get(about.__compatibility__)
if r.status_code != 200:
msg.fail(Messages.M003.format(code=r.status_code), Messages.M021, exits=1)
msg.good("Loaded compatibility table")
compat = r.json()["spacy"]
current_compat = compat.get(about.__version__)
if not current_compat:
prints(about.__compatibility__, exits=1,
title=Messages.M022.format(version=about.__version__))
msg.fail(
Messages.M022.format(version=about.__version__),
about.__compatibility__,
exits=1,
)
all_models = set()
for spacy_v, models in dict(compat).items():
all_models.update(models.keys())
@ -33,33 +39,38 @@ def validate():
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
model_links = get_model_links(current_compat)
model_pkgs = get_model_pkgs(current_compat, all_models)
incompat_links = {l for l, d in model_links.items() if not d['compat']}
incompat_models = {d['name'] for _, d in model_pkgs.items()
if not d['compat']}
incompat_models.update([d['name'] for _, d in model_links.items()
if not d['compat']])
incompat_links = {l for l, d in model_links.items() if not d["compat"]}
incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
incompat_models.update(
[d["name"] for _, d in model_links.items() if not d["compat"]]
)
na_models = [m for m in incompat_models if m not in current_compat]
update_models = [m for m in incompat_models if m in current_compat]
spacy_dir = Path(__file__).parent.parent
msg.divider(Messages.M023.format(version=about.__version__))
msg.info("spaCy installation: {}".format(path2str(spacy_dir)))
prints(path2str(Path(__file__).parent.parent),
title=Messages.M023.format(version=about.__version__))
if model_links or model_pkgs:
print(get_row('TYPE', 'NAME', 'MODEL', 'VERSION', ''))
header = ("TYPE", "NAME", "MODEL", "VERSION", "")
rows = []
for name, data in model_pkgs.items():
print(get_model_row(current_compat, name, data, 'package'))
rows.append(get_model_row(current_compat, name, data, msg))
for name, data in model_links.items():
print(get_model_row(current_compat, name, data, 'link'))
rows.append(get_model_row(current_compat, name, data, msg, "link"))
msg.table(rows, header=header)
else:
prints(Messages.M024, exits=0)
msg.text(Messages.M024, exits=0)
if update_models:
cmd = ' python -m spacy download {}'
print("\n " + Messages.M025)
print('\n'.join([cmd.format(pkg) for pkg in update_models]))
msg.divider("Install updates")
cmd = "python -m spacy download {}"
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
if na_models:
prints(Messages.M025.format(version=about.__version__,
models=', '.join(na_models)))
msg.text(
Messages.M025.format(version=about.__version__, models=", ".join(na_models))
)
if incompat_links:
prints(Messages.M027.format(path=path2str(get_data_path())))
msg.text(Messages.M027.format(path=path2str(get_data_path())))
if incompat_models or incompat_links:
sys.exit(1)
@ -70,50 +81,48 @@ def get_model_links(compat):
if data_path:
models = [p for p in data_path.iterdir() if is_model_path(p)]
for model in models:
meta_path = Path(model) / 'meta.json'
meta_path = Path(model) / "meta.json"
if not meta_path.exists():
continue
meta = read_json(meta_path)
link = model.parts[-1]
name = meta['lang'] + '_' + meta['name']
links[link] = {'name': name, 'version': meta['version'],
'compat': is_compat(compat, name, meta['version'])}
name = meta["lang"] + "_" + meta["name"]
links[link] = {
"name": name,
"version": meta["version"],
"compat": is_compat(compat, name, meta["version"]),
}
return links
def get_model_pkgs(compat, all_models):
pkgs = {}
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
package = pkg_name.replace('-', '_')
package = pkg_name.replace("-", "_")
if package in all_models:
version = pkg_data.version
pkgs[pkg_name] = {'name': package, 'version': version,
'compat': is_compat(compat, package, version)}
pkgs[pkg_name] = {
"name": package,
"version": version,
"compat": is_compat(compat, package, version),
}
return pkgs
def get_model_row(compat, name, data, type='package'):
tpl_red = '\x1b[38;5;1m{}\x1b[0m'
tpl_green = '\x1b[38;5;2m{}\x1b[0m'
if data['compat']:
comp = tpl_green.format(locale_escape('', errors='ignore'))
version = tpl_green.format(data['version'])
def get_model_row(compat, name, data, msg, model_type="package"):
if data["compat"]:
comp = msg.text("", color="green", icon="good", no_print=True)
version = msg.text(data["version"], color="green", no_print=True)
else:
comp = '--> {}'.format(compat.get(data['name'], ['n/a'])[0])
version = tpl_red.format(data['version'])
return get_row(type, name, data['name'], version, comp)
def get_row(*args):
tpl_row = ' {:<10}' + (' {:<20}' * 4)
return tpl_row.format(*args)
version = msg.text(data["version"], color="red", no_print=True)
comp = "--> {}".format(compat.get(data["name"], ["n/a"])[0])
return (model_type, name, data["name"], version, comp)
def is_model_path(model_path):
exclude = ['cache', 'pycache', '__pycache__']
exclude = ["cache", "pycache", "__pycache__"]
name = model_path.parts[-1]
return (model_path.is_dir() and name not in exclude
and not name.startswith('.'))
return model_path.is_dir() and name not in exclude and not name.startswith(".")
def is_compat(compat, name, version):
@ -122,6 +131,6 @@ def is_compat(compat, name, version):
def reformat_version(version):
"""Hack to reformat old versions ending on '-alpha' to match pip format."""
if version.endswith('-alpha'):
return version.replace('-alpha', 'a0')
return version.replace('-alpha', 'a')
if version.endswith("-alpha"):
return version.replace("-alpha", "a0")
return version.replace("-alpha", "a")

View File

@ -1,59 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
import plac
import json
import spacy
import numpy
from pathlib import Path
from ..vectors import Vectors
from ..util import prints, ensure_path
@plac.annotations(
lang=("model language", "positional", None, str),
output_dir=("model output directory", "positional", None, Path),
lexemes_loc=("location of JSONL-formatted lexical data", "positional",
None, Path),
vectors_loc=("optional: location of vectors data, as numpy .npz",
"positional", None, str),
prune_vectors=("optional: number of vectors to prune to.",
"option", "V", int)
)
def make_vocab(lang, output_dir, lexemes_loc, vectors_loc=None, prune_vectors=-1):
"""Compile a vocabulary from a lexicon jsonl file and word vectors."""
if not lexemes_loc.exists():
prints(lexemes_loc, title="Can't find lexical data", exits=1)
vectors_loc = ensure_path(vectors_loc)
nlp = spacy.blank(lang)
for word in nlp.vocab:
word.rank = 0
lex_added = 0
with lexemes_loc.open() as file_:
for line in file_:
if line.strip():
attrs = json.loads(line)
if 'settings' in attrs:
nlp.vocab.cfg.update(attrs['settings'])
else:
lex = nlp.vocab[attrs['orth']]
lex.set_attrs(**attrs)
assert lex.rank == attrs['id']
lex_added += 1
if vectors_loc is not None:
vector_data = numpy.load(vectors_loc.open('rb'))
nlp.vocab.vectors = Vectors(data=vector_data)
for word in nlp.vocab:
if word.rank:
nlp.vocab.vectors.add(word.orth, row=word.rank)
if prune_vectors >= 1:
remap = nlp.vocab.prune_vectors(prune_vectors)
if not output_dir.exists():
output_dir.mkdir()
nlp.to_disk(output_dir)
vec_added = len(nlp.vocab.vectors)
prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir,
title="Sucessfully compiled vocab and vectors, and saved model")
return nlp

View File

@ -5,7 +5,6 @@ import os
import sys
import ujson
import itertools
import locale
from thinc.neural.util import copy_array
@ -136,12 +135,3 @@ def import_file(name, loc):
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
return module
def locale_escape(string, errors="replace"):
"""
Mangle non-supported characters, for savages with ascii terminals.
"""
encoding = locale.getpreferredencoding()
string = string.encode(encoding, errors).decode("utf8")
return string

View File

@ -5,7 +5,7 @@ from .render import DependencyRenderer, EntityRenderer
from ..tokens import Doc, Span
from ..compat import b_to_str
from ..errors import Errors, Warnings, user_warning
from ..util import prints, is_in_jupyter
from ..util import is_in_jupyter
_html = {}
@ -72,14 +72,12 @@ def serve(
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
httpd = simple_server.make_server("0.0.0.0", port, app)
prints(
"Using the '{}' visualizer".format(style),
title="Serving on port {}...".format(port),
)
print("\nUsing the '{}' visualizer".format(style))
print("Serving on port {}...\n".format(port))
try:
httpd.serve_forever()
except KeyboardInterrupt:
prints("Shutting down server on port {}.".format(port))
print("Shutting down server on port {}.".format(port))
finally:
httpd.server_close()

View File

@ -278,6 +278,12 @@ class Errors(object):
E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A token"
" can only be part of one entity, so make sure the entities you're "
"setting don't overlap.")
E104 = ("Can't find JSON schema for '{name}'.")
E105 = ("The Doc.print_tree() method is now deprecated. Please use "
"Doc.json() instead.")
E106 = ("Can't find doc._.{attr} attribute specified in the underscore "
"settings: {opts}")
E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
@add_codes

View File

@ -15,7 +15,7 @@ import json
import ujson
from . import _align
from . import _align
from .syntax import nonproj
from .tokens import Doc
from .errors import Errors
@ -172,7 +172,7 @@ class GoldCorpus(object):
def dev_tuples(self):
locs = (self.tmp_dir / 'dev').iterdir()
yield from self.read_tuples(locs, limit=self.limit)
@property
def train_tuples(self):
locs = (self.tmp_dir / 'train').iterdir()
@ -271,6 +271,53 @@ def _corrupt(c, noise_level):
return c.lower()
def read_json_object(json_corpus_section):
"""Take a list of JSON-formatted documents (e.g. from an already loaded
training data file) and yield tuples in the GoldParse format.
json_corpus_section (list): The data.
YIELDS (tuple): The reformatted data.
"""
for json_doc in json_corpus_section:
tuple_doc = json_to_tuple(json_doc)
for tuple_paragraph in tuple_doc:
yield tuple_paragraph
def json_to_tuple(doc):
"""Convert an item in the JSON-formatted training data to the tuple format
used by GoldParse.
doc (dict): One entry in the training data.
YIELDS (tuple): The reformatted data.
"""
paragraphs = []
for paragraph in doc['paragraphs']:
sents = []
for sent in paragraph['sentences']:
words = []
ids = []
tags = []
heads = []
labels = []
ner = []
for i, token in enumerate(sent['tokens']):
words.append(token['orth'])
ids.append(i)
tags.append(token.get('tag', '-'))
heads.append(token.get('head', 0) + i)
labels.append(token.get('dep', ''))
# Ensure ROOT label is case-insensitive
if labels[-1].lower() == 'root':
labels[-1] = 'ROOT'
ner.append(token.get('ner', '-'))
sents.append([
[ids, words, tags, heads, labels, ner],
sent.get('brackets', [])])
if sents:
yield [paragraph.get('raw', None), sents]
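For reference, a minimal training-data document in the JSON format consumed above; the field names match the keys read by json_to_tuple(), and the values are invented:
# One document in spaCy's JSON training format (illustrative values only).
json_doc = {
    "id": 0,
    "paragraphs": [
        {
            "raw": "Hello world",
            "sentences": [
                {
                    "tokens": [
                        # 'head' is a relative offset, added to the token index above
                        {"id": 0, "orth": "Hello", "tag": "UH", "head": 1, "dep": "intj", "ner": "O"},
                        {"id": 1, "orth": "world", "tag": "NN", "head": 0, "dep": "ROOT", "ner": "O"},
                    ],
                    "brackets": [],
                }
            ],
        }
    ],
}
# list(json_to_tuple(json_doc)) yields one entry per paragraph:
# [["Hello world", [[[ids, words, tags, heads, deps, ner], brackets]]]]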
def read_json_file(loc, docs_filter=None, limit=None):
loc = util.ensure_path(loc)
if loc.is_dir():
@ -280,31 +327,8 @@ def read_json_file(loc, docs_filter=None, limit=None):
for doc in _json_iterate(loc):
if docs_filter is not None and not docs_filter(doc):
continue
paragraphs = []
for paragraph in doc['paragraphs']:
sents = []
for sent in paragraph['sentences']:
words = []
ids = []
tags = []
heads = []
labels = []
ner = []
for i, token in enumerate(sent['tokens']):
words.append(token['orth'])
ids.append(i)
tags.append(token.get('tag', '-'))
heads.append(token.get('head', 0) + i)
labels.append(token.get('dep', ''))
# Ensure ROOT label is case-insensitive
if labels[-1].lower() == 'root':
labels[-1] = 'ROOT'
ner.append(token.get('ner', '-'))
sents.append([
[ids, words, tags, heads, labels, ner],
sent.get('brackets', [])])
if sents:
yield [paragraph.get('raw', None), sents]
for json_tuple in json_to_tuple(doc):
yield json_tuple
def _json_iterate(loc):
@ -573,32 +597,19 @@ cdef class GoldParse:
self.c.sent_start[i] = 0
def docs_to_json(id, docs):
'''Convert a list of Doc objects into the JSON-serializable format used by
the spacy train command. Each Doc in the list will be interpreted as a
paragraph.
'''
def docs_to_json(docs, underscore=None):
"""Convert a list of Doc objects into the JSON-serializable format used by
the spacy train command.
docs (iterable / Doc): The Doc object(s) to convert.
underscore (list): Optional list of string names of custom doc._.
attributes. Attribute values need to be JSON-serializable. Values will
be added to an "_" key in the data, e.g. "_": {"foo": "bar"}.
RETURNS (list): The data in spaCy's JSON format.
"""
if isinstance(docs, Doc):
docs = [docs]
json_doc = {'id': id, 'paragraphs': []}
for i, doc in enumerate(docs):
json_para = {'raw': doc.text, 'sentences': []}
ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
biluo_tags = biluo_tags_from_offsets(doc, ent_offsets)
for j, sent in enumerate(doc.sents):
json_sent = {'tokens': [], 'brackets': []}
for token in sent:
json_token = {"id": token.i, "orth": token.text}
if doc.is_tagged:
json_token['tag'] = token.tag_
if doc.is_parsed:
json_token['head'] = token.head.i-token.i
json_token['dep'] = token.dep_
json_token['ner'] = biluo_tags[token.i]
json_sent['tokens'].append(json_token)
json_para['sentences'].append(json_sent)
json_doc['paragraphs'].append(json_para)
return json_doc
return [doc.to_json(underscore=underscore) for doc in docs]
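A hedged usage sketch of the new signature; the custom attribute name is hypothetical, and the sentencizer is added only so the Doc has the sentence boundaries that Doc.to_json() serializes:
# Illustrative only.
import spacy
from spacy.gold import docs_to_json
from spacy.tokens import Doc

Doc.set_extension("source", default=None)        # hypothetical custom attribute
nlp = spacy.blank("en")
nlp.add_pipe(nlp.create_pipe("sentencizer"))     # provide sentence boundaries
doc = nlp("Hello world.")
doc._.source = "example.txt"
data = docs_to_json([doc], underscore=["source"])
# data is a list with one Doc.to_json() dict per Doc; the custom value ends up
# under data[0]["_"]["source"]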
def biluo_tags_from_offsets(doc, entities, missing='O'):

View File

@ -341,21 +341,3 @@ def test_lowest_common_ancestor(en_tokenizer):
assert lca[1, 1] == 1
assert lca[0, 1] == 2
assert lca[1, 2] == 2
def test_parse_tree(en_tokenizer):
"""Tests doc.print_tree() method."""
text = "I like New York in Autumn."
heads = [1, 0, 1, -2, -3, -1, -5]
tags = ["PRP", "IN", "NNP", "NNP", "IN", "NNP", "."]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags)
# full method parse_tree(text) is a trivial composition
trees = doc.print_tree()
assert len(trees) > 0
tree = trees[0]
assert all(
k in list(tree.keys())
for k in ["word", "lemma", "NE", "POS_fine", "POS_coarse", "arc", "modifiers"]
)
assert tree["word"] == "like" # check root is correct

View File

@ -0,0 +1,65 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy.cli.schemas import get_schema, validate_json
from spacy.tokens import Doc
from ..util import get_doc
@pytest.fixture()
def doc(en_vocab):
words = ["c", "d", "e"]
pos = ["VERB", "NOUN", "NOUN"]
tags = ["VBP", "NN", "NN"]
heads = [0, -1, -2]
deps = ["ROOT", "dobj", "dobj"]
ents = [(1, 2, "ORG")]
return get_doc(
en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
)
def test_doc_to_json(doc):
json_doc = doc.to_json()
assert json_doc["text"] == "c d e "
assert len(json_doc["tokens"]) == 3
assert json_doc["tokens"][0]["pos"] == "VERB"
assert json_doc["tokens"][0]["tag"] == "VBP"
assert json_doc["tokens"][0]["dep"] == "ROOT"
assert len(json_doc["ents"]) == 1
assert json_doc["ents"][0]["start"] == 2 # character offset!
assert json_doc["ents"][0]["end"] == 3 # character offset!
assert json_doc["ents"][0]["label"] == "ORG"
def test_doc_to_json_underscore(doc):
Doc.set_extension("json_test1", default=False)
Doc.set_extension("json_test2", default=False)
doc._.json_test1 = "hello world"
doc._.json_test2 = [1, 2, 3]
json_doc = doc.to_json(underscore=["json_test1", "json_test2"])
assert "_" in json_doc
assert json_doc["_"]["json_test1"] == "hello world"
assert json_doc["_"]["json_test2"] == [1, 2, 3]
def test_doc_to_json_underscore_error_attr(doc):
"""Test that Doc.to_json() raises an error if a custom attribute doesn't
exist in the ._ space."""
with pytest.raises(ValueError):
doc.to_json(underscore=["json_test3"])
def test_doc_to_json_underscore_error_serialize(doc):
"""Test that Doc.to_json() raises an error if a custom attribute value
isn't JSON-serializable."""
Doc.set_extension("json_test4", method=lambda doc: doc.text)
with pytest.raises(ValueError):
doc.to_json(underscore=["json_test4"])
def test_doc_to_json_valid_training(doc):
json_doc = doc.to_json()
errors = validate_json([json_doc], get_schema("training"))
assert not errors

View File

@ -3,7 +3,6 @@ from __future__ import unicode_literals
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
from ..util import get_doc

View File

@ -2,9 +2,7 @@
from __future__ import unicode_literals
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
from spacy.gold import docs_to_json
from spacy.tokens import Doc
from .util import get_doc
def test_gold_biluo_U(en_vocab):
@ -52,34 +50,3 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
assert biluo_tags_converted == biluo_tags
offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
assert offsets_converted == offsets
def test_docs_to_json(en_vocab):
"""Test we can convert a list of Doc objects into the JSON-serializable
format we use for training.
"""
docs = [
get_doc(
en_vocab,
words=["a", "b"],
pos=["VBP", "NN"],
heads=[0, -1],
deps=["ROOT", "dobj"],
ents=[],
),
get_doc(
en_vocab,
words=["c", "d", "e"],
pos=["VBP", "NN", "NN"],
heads=[0, -1, -2],
deps=["ROOT", "dobj", "dobj"],
ents=[(1, 2, "ORG")],
),
]
json_doc = docs_to_json(0, docs)
assert json_doc["id"] == 0
assert len(json_doc["paragraphs"]) == 2
assert len(json_doc["paragraphs"][0]["sentences"]) == 1
assert len(json_doc["paragraphs"][1]["sentences"]) == 1
assert len(json_doc["paragraphs"][0]["sentences"][0]["tokens"]) == 2
assert len(json_doc["paragraphs"][1]["sentences"][0]["tokens"]) == 3

View File

@ -0,0 +1,44 @@
# coding: utf-8
from __future__ import unicode_literals
from spacy.cli.schemas import validate_json, get_schema
import pytest
@pytest.fixture(scope="session")
def training_schema():
return get_schema("training")
def test_json_schema_get():
schema = get_schema("training")
assert schema
with pytest.raises(ValueError):
schema = get_schema("xxx")
@pytest.mark.parametrize(
"data",
[
{"text": "Hello world"},
{"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "TEST"}]},
],
)
def test_json_schema_training_valid(data, training_schema):
errors = validate_json([data], training_schema)
assert not errors
@pytest.mark.parametrize(
"data,n_errors",
[
({"spans": []}, 1),
({"text": "Hello", "ents": [{"start": "0", "end": "5", "label": "TEST"}]}, 2),
({"text": "Hello", "ents": [{"start": 0, "end": 5}]}, 1),
({"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "test"}]}, 1),
({"text": "spaCy", "tokens": [{"pos": "PROPN"}]}, 2),
],
)
def test_json_schema_training_invalid(data, n_errors, training_schema):
errors = validate_json([data], training_schema)
assert len(errors) == n_errors

View File

@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
from pathlib import Path
from spacy import util

View File

@ -20,7 +20,6 @@ from .span cimport Span
from .token cimport Token
from .span cimport Span
from .token cimport Token
from .printers import parse_tree
from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t
from ..attrs import intify_attrs, IDS
@ -29,7 +28,7 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
from ..attrs cimport ENT_TYPE, SENT_START
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
from ..util import normalize_slice
from ..util import normalize_slice, is_json_serializable
from ..compat import is_config, copy_reg, pickle, basestring_
from ..errors import deprecation_warning, models_warning, user_warning
from ..errors import Errors, Warnings
@ -959,31 +958,48 @@ cdef class Doc:
return self[start]
def print_tree(self, light=False, flat=False):
"""Returns the parse trees in JSON (dict) format.
raise ValueError(Errors.E105)
light (bool): Don't include lemmas or entities.
flat (bool): Don't include arcs or modifiers.
RETURNS (dict): Parse tree as dict.
def to_json(self, underscore=None):
"""Convert a Doc to JSON. Produces the same format used by the spacy
train command.
EXAMPLE:
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
>>> trees = doc.print_tree()
>>> trees[1]
{'modifiers': [
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice',
'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP',
'lemma': 'Alice'},
{'modifiers': [
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
'POS_fine': 'NN', 'lemma': 'pizza'},
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
'POS_fine': 'VBD', 'lemma': 'eat'}
underscore (list): Optional list of string names of custom doc._.
attributes. Attribute values need to be JSON-serializable. Values will
be added to an "_" key in the data, e.g. "_": {"foo": "bar"}.
RETURNS (dict): The data in spaCy's JSON format.
"""
return parse_tree(self, light=light, flat=flat)
data = {'text': self.text}
data['ents'] = [{'start': ent.start_char, 'end': ent.end_char,
'label': ent.label_} for ent in self.ents]
sents = list(self.sents)
if sents:
data['sents'] = [{'start': sent.start_char, 'end': sent.end_char}
for sent in sents]
if self.cats:
data['cats'] = self.cats
data['tokens'] = []
for token in self:
token_data = {'id': token.i, 'start': token.idx, 'end': token.idx + len(token)}
if token.pos_:
token_data['pos'] = token.pos_
if token.tag_:
token_data['tag'] = token.tag_
if token.dep_:
token_data['dep'] = token.dep_
if token.head:
token_data['head'] = token.head.i
data['tokens'].append(token_data)
if underscore:
data['_'] = {}
for attr in underscore:
if not self.has_extension(attr):
raise ValueError(Errors.E106.format(attr=attr, opts=underscore))
value = self._.get(attr)
if not is_json_serializable(value):
raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
data['_'][attr] = value
return data
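To make the return value concrete, a hypothetical result for a short parsed text would look roughly like this; the keys come from the code above, the linguistic annotations are invented:
# Approximate shape of the dict returned by Doc.to_json() (illustrative values).
expected = {
    "text": "I like London.",
    "ents": [{"start": 7, "end": 13, "label": "GPE"}],   # character offsets
    "sents": [{"start": 0, "end": 14}],                   # character offsets
    "tokens": [
        {"id": 0, "start": 0, "end": 1, "pos": "PRON", "tag": "PRP", "dep": "nsubj", "head": 1},
        {"id": 1, "start": 2, "end": 6, "pos": "VERB", "tag": "VBP", "dep": "ROOT", "head": 1},
        {"id": 2, "start": 7, "end": 13, "pos": "PROPN", "tag": "NNP", "dep": "dobj", "head": 1},
        {"id": 3, "start": 13, "end": 14, "pos": "PUNCT", "tag": ".", "dep": "punct", "head": 1},
    ],
    "_": {"foo": "bar"},     # only present if underscore=["foo"] was passed
}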
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:

View File

@ -1,74 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from .doc import Doc
from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE
def merge_ents(doc):
"""Helper: merge adjacent entities into single tokens; modifies the doc."""
for ent in doc.ents:
ent.merge(tag=ent.root.tag_, lemma=ent.text, ent_type=ent.label_)
return doc
def format_POS(token, light, flat):
"""Helper: form the POS output for a token."""
subtree = dict([
("word", token.text),
("lemma", token.lemma_), # trigger
("NE", token.ent_type_), # trigger
("POS_fine", token.tag_),
("POS_coarse", token.pos_),
("arc", token.dep_),
("modifiers", [])
])
if light:
subtree.pop("lemma")
subtree.pop("NE")
if flat:
subtree.pop("arc")
subtree.pop("modifiers")
return subtree
def POS_tree(root, light=False, flat=False):
"""Helper: generate a POS tree for a root token. The doc must have
`merge_ents(doc)` run on it.
"""
subtree = format_POS(root, light=light, flat=flat)
for c in root.children:
subtree["modifiers"].append(POS_tree(c))
return subtree
def parse_tree(doc, light=False, flat=False):
"""Make a copy of the doc and construct a syntactic parse tree similar to
displaCy. Generates the POS tree for all sentences in a doc.
doc (Doc): The doc for parsing.
RETURNS (dict): The parse tree.
EXAMPLE:
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
>>> trees = doc.print_tree()
>>> trees[1]
{'modifiers': [
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
{'modifiers': [
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
'POS_fine': 'NN', 'lemma': 'pizza'},
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
'POS_fine': 'VBD', 'lemma': 'eat'}
"""
doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE],
doc.to_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE]))
merge_ents(doc_clone) # merge the entities into single tokens first
return [POS_tree(sent.root, light=light, flat=flat)
for sent in doc_clone.sents]

View File

@ -7,8 +7,6 @@ import pkg_resources
import importlib
import regex as re
from pathlib import Path
import sys
import textwrap
import random
from collections import OrderedDict
from thinc.neural._classes.model import Model
@ -18,9 +16,10 @@ import cytoolz
import itertools
import numpy.random
from .symbols import ORTH
from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
from .compat import import_file
from .compat import import_file, json_dumps
from .errors import Errors
# Import these directly from Thinc, so that we're sure we always have the
@ -541,6 +540,16 @@ def read_json(location):
return ujson.load(f)
def write_json(file_path, contents):
"""Create a .json file and dump contents.
file_path (unicode / Path): The path to the output file.
contents: The JSON-serializable contents to output.
"""
with Path(file_path).open("w", encoding="utf8") as f:
f.write(json_dumps(contents))
def read_jsonl(file_path):
"""Read a .jsonl file and yield its contents line by line.
@ -555,6 +564,29 @@ def read_jsonl(file_path):
continue
def write_jsonl(file_path, lines):
"""Create a .jsonl file and dump contents.
file_path (unicode / Path): The path to the output file.
lines (list): The JSON-serializable contents of each line.
"""
data = [json_dumps(line) for line in lines]
with Path(file_path).open("w", encoding="utf-8") as f:
f.write("\n".join(data))
def is_json_serializable(obj):
"""Check if a Python object is JSON-serializable."""
if hasattr(obj, "__call__"):
# Check this separately here to prevent infinite recursions
return False
try:
ujson.dumps(obj)
return True
except TypeError:
return False
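A quick, hedged round-trip sketch of the helpers above, assuming they are imported from spacy.util:
# Illustrative round trip; /tmp/data.jsonl is a throwaway path.
from spacy.util import write_jsonl, read_jsonl, is_json_serializable

lines = [{"text": "Hello"}, {"text": "world"}]
write_jsonl("/tmp/data.jsonl", lines)                  # one JSON object per line
assert list(read_jsonl("/tmp/data.jsonl")) == lines
assert is_json_serializable({"a": 1})
assert not is_json_serializable(lambda x: x)           # callables are rejected up front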
def get_raw_input(description, default=False):
"""Get user input from the command line via raw_input / input.
@ -602,21 +634,6 @@ def from_disk(path, readers, exclude):
return path
def print_table(data, title=None):
"""Print data in table format.
data (dict or list of tuples): Label/value pairs.
title (unicode or None): Title, will be printed above.
"""
if isinstance(data, dict):
data = list(data.items())
tpl_row = " {:<15}" * len(data[0])
table = "\n".join([tpl_row.format(l, unicode_(v)) for l, v in data])
if title:
print("\n \033[93m{}\033[0m".format(title))
print("\n{}\n".format(table))
def print_markdown(data, title=None):
"""Print data in GitHub-flavoured Markdown format for issues etc.
@ -638,44 +655,6 @@ def print_markdown(data, title=None):
print("\n{}\n".format("\n".join(markdown)))
def prints(*texts, **kwargs):
"""Print formatted message (manual ANSI escape sequences to avoid
dependency)
*texts (unicode): Texts to print. Each argument is rendered as paragraph.
**kwargs: 'title' becomes coloured headline. exits=True performs sys exit.
"""
exits = kwargs.get("exits", None)
title = kwargs.get("title", None)
title = "\033[93m{}\033[0m\n".format(_wrap(title)) if title else ""
message = "\n\n".join([_wrap(text) for text in texts])
print("\n{}{}\n".format(title, message))
if exits is not None:
sys.exit(exits)
def _wrap(text, wrap_max=80, indent=4):
"""Wrap text at given width using textwrap module.
text (unicode): Text to wrap. If it's a Path, it's converted to string.
wrap_max (int): Maximum line length (indent is deducted).
indent (int): Number of spaces for indentation.
RETURNS (unicode): Wrapped text.
"""
indent = indent * " "
wrap_width = wrap_max - len(indent)
if isinstance(text, Path):
text = path2str(text)
return textwrap.fill(
text,
width=wrap_width,
initial_indent=indent,
subsequent_indent=indent,
break_long_words=False,
break_on_hyphens=False,
)
def minify_html(html):
"""Perform a template-specific, rudimentary HTML minification for displaCy.
Disclaimer: NOT a general-purpose solution, only removes indentation and

View File

@ -320,37 +320,6 @@ p
+cell dict
+cell Combined tokenizer exceptions.
+h(3, "util.prints") util.prints
+tag function
+tag-new(2)
p
| Print a formatted, text-wrapped message with optional title. If a text
| argument is a #[code Path], it's converted to a string. Should only
| be used for interactive components like the command-line interface.
+aside-code("Example").
data_path = Path('/some/path')
if not path.exists():
util.prints("Can't find the path.", data_path,
title="Error", exits=1)
+table(["Name", "Type", "Description"])
+row
+cell #[code *texts]
+cell unicode
+cell Texts to print. Each argument is rendered as paragraph.
+row
+cell #[code **kwargs]
+cell -
+cell
| #[code title] is rendered as coloured headline. #[code exits]
| performs system exit after printing, using the value of the
| argument as the exit code, e.g. #[code exits=1].
+h(3, "util.minibatch") util.minibatch
+tag function
+tag-new(2)

View File

@@ -257,10 +257,19 @@ p
| to allow packaging the model using the
| #[+api("cli#package") #[code package]] command.
+infobox("Changed in v2.1", "⚠️")
| As of spaCy 2.1, the #[code --no-tagger], #[code --no-parser] and
| #[code --no-entities] flags have been replaced by a #[code --pipeline]
| option, which lets you define comma-separated names of pipeline
| components to train. For example, #[code --pipeline tagger,parser] will
| only train the tagger and parser.
+code(false, "bash", "$", false, false, true).
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter]
[--n-sents] [--use-gpu] [--meta-path] [--vectors] [--no-tagger] [--no-parser]
[--no-entities] [--gold-preproc] [--verbose]
python -m spacy train [lang] [output_path] [train_path] [dev_path]
[--base-model] [--pipeline] [--vectors] [--n-iter] [--n-examples] [--use-gpu]
[--version] [--meta-path] [--init-tok2vec] [--parser-multitasks]
[--entity-multitasks] [--gold-preproc] [--noise-level] [--learn-tokens]
[--verbose]
+table(["Argument", "Type", "Description"])
+row
@@ -269,34 +278,34 @@ p
+cell Model language.
+row
+cell #[code output_dir]
+cell #[code output_path]
+cell positional
+cell Directory to store model in.
+cell Directory to store model in. Will be created if it doesn't exist.
+row
+cell #[code train_data]
+cell #[code train_path]
+cell positional
+cell Location of JSON-formatted training data.
+row
+cell #[code dev_data]
+cell #[code dev_path]
+cell positional
+cell Location of JSON-formatted development data for evaluation.
+row
+cell #[code --n-iter], #[code -n]
+cell #[code --base-model], #[code -b]
+cell option
+cell Number of iterations (default: #[code 30]).
+cell
| Optional name of base model to update. Can be any loadable
| spaCy model.
+row
+cell #[code --n-sents], #[code -ns]
+cell #[code --pipeline], #[code -p]
+tag-new("2.1.0")
+cell option
+cell Number of sentences (default: #[code 0]).
+row
+cell #[code --use-gpu], #[code -g]
+cell option
+cell Use GPU.
+cell
| Comma-separated names of pipeline components to train. Defaults
| to #[code 'tagger,parser,ner'].
+row
+cell #[code --vectors], #[code -v]
@@ -304,13 +313,21 @@ p
+cell Model to load vectors from.
+row
+cell #[code --meta-path], #[code -m]
+cell #[code --n-iter], #[code -n]
+cell option
+cell Number of iterations (default: #[code 30]).
+row
+cell #[code --n-examples], #[code -ns]
+cell option
+cell Number of examples to use (defaults to #[code 0] for all examples).
+row
+cell #[code --use-gpu], #[code -g]
+cell option
+cell
| #[+tag-new(2)] Optional path to model
| #[+a("/usage/training#models-generating") #[code meta.json]].
| All relevant properties like #[code lang], #[code pipeline] and
| #[code spacy_version] will be overwritten.
| Whether to use GPU. Can be either #[code 0], #[code 1] or
| #[code -1].
+row
+cell #[code --version], #[code -V]
@@ -320,40 +337,69 @@ p
| #[code meta.json] after training.
+row
+cell #[code --no-tagger], #[code -T]
+cell flag
+cell Don't train tagger.
+cell #[code --meta-path], #[code -m]
+tag-new(2)
+cell option
+cell
| Optional path to model
| #[+a("/usage/training#models-generating") #[code meta.json]].
| All relevant properties like #[code lang], #[code pipeline] and
| #[code spacy_version] will be overwritten.
+row
+cell #[code --no-parser], #[code -P]
+cell flag
+cell Don't train parser.
+cell #[code --init-tok2vec], #[code -t2v]
+tag-new("2.1.0")
+cell option
+cell
| Path to pretrained weights for the token-to-vector parts of the
| models. See #[code spacy pretrain]. Experimental.
+row
+cell #[code --no-entities], #[code -N]
+cell flag
+cell Don't train NER.
+cell #[code --parser-multitasks], #[code -pt]
+cell option
+cell
| Side objectives for parser CNN, e.g. #[code 'dep'] or
| #[code 'dep,tag']
+row
+cell #[code --entity-multitasks], #[code -et]
+cell option
+cell
| Side objectives for NER CNN, e.g. #[code 'dep'] or
| #[code 'dep,tag']
+row
+cell #[code --noise-level], #[code -nl]
+cell option
+cell Float indicating the amount of corruption for data augmentation.
+row
+cell #[code --gold-preproc], #[code -G]
+cell flag
+cell Use gold preprocessing.
+row
+cell #[code --learn-tokens], #[code -T]
+cell flag
+cell
| Make parser learn gold-standard tokenization by merging
| subtokens. Typically used for languages like Chinese.
+row
+cell #[code --verbose], #[code -VV]
+tag-new("2.0.13")
+cell flag
+cell Show more detailed messages during training.
+row
+cell #[code --help], #[code -h]
+cell flag
+cell Show help message and available arguments.
+row
+cell #[code --verbose]
+tag-new("2.0.13")
+cell flag
+cell Show more detailed messages during training.
+row("foot")
+cell creates
+cell model, pickle
+cell A spaCy model on each epoch, and a final #[code .pickle] file.
+cell A spaCy model on each epoch.
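As a worked illustration of the new signature, here is a hedged sketch of an equivalent programmatic call. It assumes the CLI flags above map onto keyword arguments of spacy.cli.train with the same underscored names; the language code and paths are placeholders:

from spacy.cli import train

# Assumption: the CLI flags map to keyword arguments with underscored names;
# the paths below are placeholders, not part of the documented command.
train(
    "en",                      # lang
    "/tmp/model-out",          # output_path, created if it doesn't exist
    "train.json",              # train_path: JSON-formatted training data
    "dev.json",                # dev_path: JSON-formatted development data
    pipeline="tagger,parser",  # only train the tagger and parser
    n_iter=10,
)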
+h(4, "train-hyperparams") Environment variables for hyperparameters
+tag-new(2)