💫 New system for error messages and warnings (#2163)

* Add spacy.errors module
* Update deprecation and user warnings
* Replace errors and asserts with new error message system
* Remove redundant asserts
* Fix whitespace
* Add messages for print/util.prints statements
* Fix typo
* Fix typos
* Move CLI messages to spacy.cli._messages
* Add decorator to display error code with message

  An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about
* Update errors for invalid pipeline components
* Improve error for unknown factories
* Add displaCy warnings
* Update formatting consistency
* Move error message to spacy.errors
* Update errors and check if doc returned by component is None
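The decorator described above is easiest to see end to end. The sketch below restates the `add_codes` helper and the `E001` message exactly as they appear in the new `spacy/errors.py` further down in this diff; the call site and the rendered `KeyError` text are illustrative only.

```python
# Sketch of the pattern this commit introduces in spacy/errors.py: add_codes
# wraps the message class so that attribute lookup returns the message
# prefixed with its own code. Only the retrieved string is modified, so the
# raw class attributes and tracebacks stay untouched.
def add_codes(err_cls):
    """Add error codes to string messages via class attribute names."""
    class ErrorsWithCodes(object):
        def __getattribute__(self, code):
            msg = getattr(err_cls, code)
            return '[{code}] {msg}'.format(code=code, msg=msg)
    return ErrorsWithCodes()


@add_codes
class Errors(object):
    E001 = ("No component '{name}' found in pipeline. Available names: {opts}")


# Illustrative call site, as in Language.get_pipe below:
#     raise KeyError(Errors.E001.format(name='ner', opts=['tagger', 'parser']))
# which renders as:
#     KeyError: "[E001] No component 'ner' found in pipeline. Available names: ['tagger', 'parser']"
```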
This commit is contained in:
parent abf8b16d71
commit 3141e04822
				|  | @ -4,18 +4,14 @@ from __future__ import unicode_literals | |||
| from .cli.info import info as cli_info | ||||
| from .glossary import explain | ||||
| from .about import __version__ | ||||
| from .errors import Warnings, deprecation_warning | ||||
| from . import util | ||||
| 
 | ||||
| 
 | ||||
| def load(name, **overrides): | ||||
|     depr_path = overrides.get('path') | ||||
|     if depr_path not in (True, False, None): | ||||
|         util.deprecated( | ||||
|             "As of spaCy v2.0, the keyword argument `path=` is deprecated. " | ||||
|             "You can now call spacy.load with the path as its first argument, " | ||||
|             "and the model's meta.json will be used to determine the language " | ||||
|             "to load. For example:\nnlp = spacy.load('{}')".format(depr_path), | ||||
|             'error') | ||||
|         deprecation_warning(Warnings.W001.format(path=depr_path)) | ||||
|     return util.load_model(name, **overrides) | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -23,6 +23,7 @@ from thinc.neural._classes.affine import _set_dimensions_if_needed | |||
| import thinc.extra.load_nlp | ||||
| 
 | ||||
| from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE | ||||
| from .errors import Errors | ||||
| from . import util | ||||
| 
 | ||||
| 
 | ||||
|  | @ -340,10 +341,10 @@ def _divide_array(X, size): | |||
| 
 | ||||
| 
 | ||||
| def get_col(idx): | ||||
|     assert idx >= 0, idx | ||||
|     if idx < 0: | ||||
|         raise IndexError(Errors.E066.format(value=idx)) | ||||
| 
 | ||||
|     def forward(X, drop=0.): | ||||
|         assert idx >= 0, idx | ||||
|         if isinstance(X, numpy.ndarray): | ||||
|             ops = NumpyOps() | ||||
|         else: | ||||
|  | @ -351,7 +352,6 @@ def get_col(idx): | |||
|         output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype) | ||||
| 
 | ||||
|         def backward(y, sgd=None): | ||||
|             assert idx >= 0, idx | ||||
|             dX = ops.allocate(X.shape) | ||||
|             dX[:, idx] += y | ||||
|             return dX | ||||
|  |  | |||
|  | @ -11,7 +11,6 @@ __email__ = 'contact@explosion.ai' | |||
| __license__ = 'MIT' | ||||
| __release__ = True | ||||
| 
 | ||||
| __docs_models__ = 'https://spacy.io/usage/models' | ||||
| __download_url__ = 'https://github.com/explosion/spacy-models/releases/download' | ||||
| __compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json' | ||||
| __shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json' | ||||
|  |  | |||
							
								
								
									
spacy/cli/_messages.py (new file, +73 lines)
							|  | @ -0,0 +1,73 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| class Messages(object): | ||||
|     M001 = ("Download successful but linking failed") | ||||
|     M002 = ("Creating a shortcut link for 'en' didn't work (maybe you " | ||||
|             "don't have admin permissions?), but you can still load the " | ||||
|             "model via its full package name: nlp = spacy.load('{name}')") | ||||
|     M003 = ("Server error ({code}: {desc})") | ||||
|     M004 = ("Couldn't fetch {desc}. Please find a model for your spaCy " | ||||
|             "installation (v{version}), and download it manually. For more " | ||||
|             "details, see the documentation: https://spacy.io/usage/models") | ||||
|     M005 = ("Compatibility error") | ||||
|     M006 = ("No compatible models found for v{version} of spaCy.") | ||||
|     M007 = ("No compatible model found for '{name}' (spaCy v{version}).") | ||||
|     M008 = ("Can't locate model data") | ||||
|     M009 = ("The data should be located in {path}") | ||||
|     M010 = ("Can't find the spaCy data path to create model symlink") | ||||
|     M011 = ("Make sure a directory `/data` exists within your spaCy " | ||||
|             "installation and try again. The data directory should be " | ||||
|             "located here:") | ||||
|     M012 = ("Link '{name}' already exists") | ||||
|     M013 = ("To overwrite an existing link, use the --force flag.") | ||||
|     M014 = ("Can't overwrite symlink '{name}'") | ||||
|     M015 = ("This can happen if your data directory contains a directory or " | ||||
|             "file of the same name.") | ||||
|     M016 = ("Error: Couldn't link model to '{name}'") | ||||
|     M017 = ("Creating a symlink in spacy/data failed. Make sure you have the " | ||||
|             "required permissions and try re-running the command as admin, or " | ||||
|             "use a virtualenv. You can still import the model as a module and " | ||||
|             "call its load() method, or create the symlink manually.") | ||||
|     M018 = ("Linking successful") | ||||
|     M019 = ("You can now load the model via spacy.load('{name}')") | ||||
|     M020 = ("Can't find model meta.json") | ||||
|     M021 = ("Couldn't fetch compatibility table.") | ||||
|     M022 = ("Can't find spaCy v{version} in compatibility table") | ||||
|     M023 = ("Installed models (spaCy v{version})") | ||||
|     M024 = ("No models found in your current environment.") | ||||
|     M025 = ("Use the following commands to update the model packages:") | ||||
|     M026 = ("The following models are not available for spaCy " | ||||
|             "v{version}: {models}") | ||||
|     M027 = ("You may also want to overwrite the incompatible links using the " | ||||
|             "`python -m spacy link` command with `--force`, or remove them " | ||||
|             "from the data directory. Data path: {path}") | ||||
|     M028 = ("Input file not found") | ||||
|     M029 = ("Output directory not found") | ||||
|     M030 = ("Unknown format") | ||||
|     M031 = ("Can't find converter for {converter}") | ||||
|     M032 = ("Generated output file {name}") | ||||
|     M033 = ("Created {n_docs} documents") | ||||
|     M034 = ("Evaluation data not found") | ||||
|     M035 = ("Visualization output directory not found") | ||||
|     M036 = ("Generated {n} parses as HTML") | ||||
|     M037 = ("Can't find words frequencies file") | ||||
|     M038 = ("Successfully compiled vocab") | ||||
|     M039 = ("{entries} entries, {vectors} vectors") | ||||
|     M040 = ("Output directory not found") | ||||
|     M041 = ("Loaded meta.json from file") | ||||
|     M042 = ("Successfully created package '{name}'") | ||||
|     M043 = ("To build the package, run `python setup.py sdist` in this " | ||||
|             "directory.") | ||||
|     M044 = ("Package directory already exists") | ||||
|     M045 = ("Please delete the directory and try again, or use the `--force` " | ||||
|             "flag to overwrite existing directories.") | ||||
|     M046 = ("Generating meta.json") | ||||
|     M047 = ("Enter the package settings for your model. The following " | ||||
|            "information will be read from your model data: pipeline, vectors.") | ||||
|     M048 = ("No '{key}' setting found in meta.json") | ||||
|     M049 = ("This setting is required to build your package.") | ||||
|     M050 = ("Training data not found") | ||||
|     M051 = ("Development data not found") | ||||
|     M052 = ("Not a valid meta.json format") | ||||
|     M053 = ("Expected dict but got: {meta_type}") | ||||
|  | @ -5,6 +5,7 @@ import plac | |||
| from pathlib import Path | ||||
| 
 | ||||
| from .converters import conllu2json, iob2json, conll_ner2json | ||||
| from ._messages import Messages | ||||
| from ..util import prints | ||||
| 
 | ||||
| # Converters are matched by file extension. To add a converter, add a new | ||||
|  | @ -32,14 +33,14 @@ def convert(input_file, output_dir, n_sents=1, morphology=False, converter='auto | |||
|     input_path = Path(input_file) | ||||
|     output_path = Path(output_dir) | ||||
|     if not input_path.exists(): | ||||
|         prints(input_path, title="Input file not found", exits=1) | ||||
|         prints(input_path, title=Messages.M028, exits=1) | ||||
|     if not output_path.exists(): | ||||
|         prints(output_path, title="Output directory not found", exits=1) | ||||
|         prints(output_path, title=Messages.M029, exits=1) | ||||
|     if converter == 'auto': | ||||
|         converter = input_path.suffix[1:] | ||||
|     if converter not in CONVERTERS: | ||||
|             prints("Can't find converter for %s" % converter, | ||||
|                 title="Unknown format", exits=1) | ||||
|             prints(Messages.M031.format(converter=converter), | ||||
|                    title=Messages.M030, exits=1) | ||||
|     func = CONVERTERS[converter] | ||||
|     func(input_path, output_path, | ||||
|          n_sents=n_sents, use_morphology=morphology) | ||||
|  |  | |||
|  | @ -1,6 +1,7 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from .._messages import Messages | ||||
| from ...compat import json_dumps, path2str | ||||
| from ...util import prints | ||||
| from ...gold import iob_to_biluo | ||||
|  | @ -18,8 +19,8 @@ def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False): | |||
|     output_file = output_path / output_filename | ||||
|     with output_file.open('w', encoding='utf-8') as f: | ||||
|         f.write(json_dumps(docs)) | ||||
|     prints("Created %d documents" % len(docs), | ||||
|            title="Generated output file %s" % path2str(output_file)) | ||||
|     prints(Messages.M033.format(n_docs=len(docs)), | ||||
|            title=Messages.M032.format(name=path2str(output_file))) | ||||
| 
 | ||||
| 
 | ||||
| def read_conll_ner(input_path): | ||||
|  |  | |||
|  | @ -1,6 +1,7 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from .._messages import Messages | ||||
| from ...compat import json_dumps, path2str | ||||
| from ...util import prints | ||||
| 
 | ||||
|  | @ -32,8 +33,8 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False): | |||
|     output_file = output_path / output_filename | ||||
|     with output_file.open('w', encoding='utf-8') as f: | ||||
|         f.write(json_dumps(docs)) | ||||
|     prints("Created %d documents" % len(docs), | ||||
|            title="Generated output file %s" % path2str(output_file)) | ||||
|     prints(Messages.M033.format(n_docs=len(docs)), | ||||
|            title=Messages.M032.format(name=path2str(output_file))) | ||||
| 
 | ||||
| 
 | ||||
| def read_conllx(input_path, use_morphology=False, n=0): | ||||
|  |  | |||
|  | @ -2,6 +2,7 @@ | |||
| from __future__ import unicode_literals | ||||
| from cytoolz import partition_all, concat | ||||
| 
 | ||||
| from .._messages import Messages | ||||
| from ...compat import json_dumps, path2str | ||||
| from ...util import prints | ||||
| from ...gold import iob_to_biluo | ||||
|  | @ -18,8 +19,8 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k): | |||
|     output_file = output_path / output_filename | ||||
|     with output_file.open('w', encoding='utf-8') as f: | ||||
|         f.write(json_dumps(docs)) | ||||
|     prints("Created %d documents" % len(docs), | ||||
|            title="Generated output file %s" % path2str(output_file)) | ||||
|     prints(Messages.M033.format(n_docs=len(docs)), | ||||
|            title=Messages.M032.format(name=path2str(output_file))) | ||||
| 
 | ||||
| 
 | ||||
| def read_iob(raw_sents): | ||||
|  |  | |||
|  | @ -8,6 +8,7 @@ import sys | |||
| import ujson | ||||
| 
 | ||||
| from .link import link | ||||
| from ._messages import Messages | ||||
| from ..util import prints, get_package_path | ||||
| from ..compat import url_read, HTTPError | ||||
| from .. import about | ||||
|  | @ -32,9 +33,7 @@ def download(model, direct=False): | |||
|         version = get_version(model_name, compatibility) | ||||
|         dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, | ||||
|                                                             v=version)) | ||||
|         if dl != 0: | ||||
|             # if download subprocess doesn't return 0, exit with the respective | ||||
|             # exit code before doing anything else | ||||
|         if dl != 0:  # if download subprocess doesn't return 0, exit | ||||
|             sys.exit(dl) | ||||
|         try: | ||||
|             # Get package path here because link uses | ||||
|  | @ -48,22 +47,15 @@ def download(model, direct=False): | |||
|             # Dirty, but since spacy.download and the auto-linking is | ||||
|             # mostly a convenience wrapper, it's best to show a success | ||||
|             # message and loading instructions, even if linking fails. | ||||
|             prints( | ||||
|                 "Creating a shortcut link for 'en' didn't work (maybe " | ||||
|                 "you don't have admin permissions?), but you can still " | ||||
|                 "load the model via its full package name:", | ||||
|                 "nlp = spacy.load('%s')" % model_name, | ||||
|                 title="Download successful but linking failed") | ||||
|             prints(Messages.M002.format(name=model_name), title=Messages.M001) | ||||
| 
 | ||||
| 
 | ||||
| def get_json(url, desc): | ||||
|     try: | ||||
|         data = url_read(url) | ||||
|     except HTTPError as e: | ||||
|         msg = ("Couldn't fetch %s. Please find a model for your spaCy " | ||||
|                "installation (v%s), and download it manually.") | ||||
|         prints(msg % (desc, about.__version__), about.__docs_models__, | ||||
|                title="Server error (%d: %s)" % (e.code, e.reason), exits=1) | ||||
|         prints(Messages.M004.format(desc=desc, version=about.__version__), | ||||
|                title=Messages.M003.format(code=e.code, desc=e.reason), exits=1) | ||||
|     return ujson.loads(data) | ||||
| 
 | ||||
| 
 | ||||
|  | @ -73,17 +65,16 @@ def get_compatibility(): | |||
|     comp_table = get_json(about.__compatibility__, "compatibility table") | ||||
|     comp = comp_table['spacy'] | ||||
|     if version not in comp: | ||||
|         prints("No compatible models found for v%s of spaCy." % version, | ||||
|                title="Compatibility error", exits=1) | ||||
|         prints(Messages.M006.format(version=version), title=Messages.M005, | ||||
|                exits=1) | ||||
|     return comp[version] | ||||
| 
 | ||||
| 
 | ||||
| def get_version(model, comp): | ||||
|     model = model.rsplit('.dev', 1)[0] | ||||
|     if model not in comp: | ||||
|         version = about.__version__ | ||||
|         msg = "No compatible model found for '%s' (spaCy v%s)." | ||||
|         prints(msg % (model, version), title="Compatibility error", exits=1) | ||||
|         prints(Messages.M007.format(name=model, version=about.__version__), | ||||
|                title=Messages.M005, exits=1) | ||||
|     return comp[model][0] | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -4,6 +4,7 @@ from __future__ import unicode_literals, division, print_function | |||
| import plac | ||||
| from timeit import default_timer as timer | ||||
| 
 | ||||
| from ._messages import Messages | ||||
| from ..gold import GoldCorpus | ||||
| from ..util import prints | ||||
| from .. import util | ||||
|  | @ -33,10 +34,9 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None | |||
|     data_path = util.ensure_path(data_path) | ||||
|     displacy_path = util.ensure_path(displacy_path) | ||||
|     if not data_path.exists(): | ||||
|         prints(data_path, title="Evaluation data not found", exits=1) | ||||
|         prints(data_path, title=Messages.M034, exits=1) | ||||
|     if displacy_path and not displacy_path.exists(): | ||||
|         prints(displacy_path, title="Visualization output directory not found", | ||||
|                exits=1) | ||||
|         prints(displacy_path, title=Messages.M035, exits=1) | ||||
|     corpus = GoldCorpus(data_path, data_path) | ||||
|     nlp = util.load_model(model) | ||||
|     dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc)) | ||||
|  | @ -52,8 +52,7 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None | |||
|         render_ents = 'ner' in nlp.meta.get('pipeline', []) | ||||
|         render_parses(docs, displacy_path, model_name=model, | ||||
|                       limit=displacy_limit, deps=render_deps, ents=render_ents) | ||||
|         msg = "Generated %s parses as HTML" % displacy_limit | ||||
|         prints(displacy_path, title=msg) | ||||
|         prints(displacy_path, title=Messages.M036.format(n=displacy_limit)) | ||||
| 
 | ||||
| 
 | ||||
| def render_parses(docs, output_path, model_name='', limit=250, deps=True, | ||||
|  |  | |||
|  | @ -5,9 +5,10 @@ import plac | |||
| import platform | ||||
| from pathlib import Path | ||||
| 
 | ||||
| from ._messages import Messages | ||||
| from ..compat import path2str | ||||
| from .. import about | ||||
| from .. import util | ||||
| from .. import about | ||||
| 
 | ||||
| 
 | ||||
| @plac.annotations( | ||||
|  | @ -25,7 +26,7 @@ def info(model=None, markdown=False): | |||
|             model_path = util.get_data_path() / model | ||||
|         meta_path = model_path / 'meta.json' | ||||
|         if not meta_path.is_file(): | ||||
|             util.prints(meta_path, title="Can't find model meta.json", exits=1) | ||||
|             util.prints(meta_path, title=Messages.M020, exits=1) | ||||
|         meta = util.read_json(meta_path) | ||||
|         if model_path.resolve() != model_path: | ||||
|             meta['link'] = path2str(model_path) | ||||
|  |  | |||
|  | @ -11,7 +11,9 @@ from preshed.counter import PreshCounter | |||
| import tarfile | ||||
| import gzip | ||||
| 
 | ||||
| from ._messages import Messages | ||||
| from ..vectors import Vectors | ||||
| from ..errors import Warnings, user_warning | ||||
| from ..util import prints, ensure_path, get_lang_class | ||||
| 
 | ||||
| try: | ||||
|  | @ -37,16 +39,13 @@ def init_model(lang, output_dir, freqs_loc=None, clusters_loc=None, vectors_loc= | |||
|     and word vectors. | ||||
|     """ | ||||
|     if freqs_loc is not None and not freqs_loc.exists(): | ||||
|         prints(freqs_loc, title="Can't find words frequencies file", exits=1) | ||||
|         prints(freqs_loc, title=Messages.M037, exits=1) | ||||
|     clusters_loc = ensure_path(clusters_loc) | ||||
|     vectors_loc = ensure_path(vectors_loc) | ||||
| 
 | ||||
|     probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20) | ||||
|     vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None) | ||||
|     clusters = read_clusters(clusters_loc) if clusters_loc else {} | ||||
| 
 | ||||
|     nlp = create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, prune_vectors) | ||||
| 
 | ||||
|     if not output_dir.exists(): | ||||
|         output_dir.mkdir() | ||||
|     nlp.to_disk(output_dir) | ||||
|  | @ -69,7 +68,6 @@ def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, pru | |||
|     nlp = lang_class() | ||||
|     for lexeme in nlp.vocab: | ||||
|         lexeme.rank = 0 | ||||
| 
 | ||||
|     lex_added = 0 | ||||
|     for i, (word, prob) in enumerate(tqdm(sorted(probs.items(), key=lambda item: item[1], reverse=True))): | ||||
|         lexeme = nlp.vocab[word] | ||||
|  | @ -89,15 +87,13 @@ def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, pru | |||
|             lexeme = nlp.vocab[word] | ||||
|             lexeme.is_oov = False | ||||
|             lex_added += 1 | ||||
| 
 | ||||
|     if len(vectors_data): | ||||
|         nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys) | ||||
|     if prune_vectors >= 1: | ||||
|         nlp.vocab.prune_vectors(prune_vectors) | ||||
|     vec_added = len(nlp.vocab.vectors) | ||||
| 
 | ||||
|     prints("{} entries, {} vectors".format(lex_added, vec_added), | ||||
|            title="Sucessfully compiled vocab") | ||||
|     prints(Messages.M039.format(entries=lex_added, vectors=vec_added), | ||||
|            title=Messages.M038) | ||||
|     return nlp | ||||
| 
 | ||||
| 
 | ||||
|  | @ -145,7 +141,7 @@ def read_clusters(clusters_loc): | |||
|     print("Reading clusters...") | ||||
|     clusters = {} | ||||
|     if ftfy is None: | ||||
|         print("Warning: No text fixing. Run pip install ftfy if necessary") | ||||
|         user_warning(Warnings.W004) | ||||
|     with clusters_loc.open() as f: | ||||
|         for line in tqdm(f): | ||||
|             try: | ||||
|  |  | |||
|  | @ -4,6 +4,7 @@ from __future__ import unicode_literals | |||
| import plac | ||||
| from pathlib import Path | ||||
| 
 | ||||
| from ._messages import Messages | ||||
| from ..compat import symlink_to, path2str | ||||
| from ..util import prints | ||||
| from .. import util | ||||
|  | @ -24,40 +25,29 @@ def link(origin, link_name, force=False, model_path=None): | |||
|     else: | ||||
|         model_path = Path(origin) if model_path is None else Path(model_path) | ||||
|     if not model_path.exists(): | ||||
|         prints("The data should be located in %s" % path2str(model_path), | ||||
|                title="Can't locate model data", exits=1) | ||||
|         prints(Messages.M009.format(path=path2str(model_path)), | ||||
|                title=Messages.M008, exits=1) | ||||
|     data_path = util.get_data_path() | ||||
|     if not data_path or not data_path.exists(): | ||||
|         spacy_loc = Path(__file__).parent.parent | ||||
|         prints("Make sure a directory `/data` exists within your spaCy " | ||||
|                "installation and try again. The data directory should be " | ||||
|                "located here:", path2str(spacy_loc), exits=1, | ||||
|                title="Can't find the spaCy data path to create model symlink") | ||||
|         prints(Messages.M011, spacy_loc, title=Messages.M010, exits=1) | ||||
|     link_path = util.get_data_path() / link_name | ||||
|     if link_path.is_symlink() and not force: | ||||
|         prints("To overwrite an existing link, use the --force flag.", | ||||
|                title="Link %s already exists" % link_name, exits=1) | ||||
|         prints(Messages.M013, title=Messages.M012.format(name=link_name), | ||||
|                exits=1) | ||||
|     elif link_path.is_symlink():  # does a symlink exist? | ||||
|         # NB: It's important to check for is_symlink here and not for exists, | ||||
|         # because invalid/outdated symlinks would return False otherwise. | ||||
|         link_path.unlink() | ||||
|     elif link_path.exists(): # does it exist otherwise? | ||||
|         # NB: Check this last because valid symlinks also "exist". | ||||
|         prints("This can happen if your data directory contains a directory " | ||||
|                "or file of the same name.", link_path, | ||||
|                title="Can't overwrite symlink %s" % link_name, exits=1) | ||||
|         prints(Messages.M015, link_path, | ||||
|                title=Messages.M014.format(name=link_name), exits=1) | ||||
|     msg = "%s --> %s" % (path2str(model_path), path2str(link_path)) | ||||
|     try: | ||||
|         symlink_to(link_path, model_path) | ||||
|     except: | ||||
|         # This is quite dirty, but just making sure other errors are caught. | ||||
|         prints("Creating a symlink in spacy/data failed. Make sure you have " | ||||
|                "the required permissions and try re-running the command as " | ||||
|                "admin, or use a virtualenv. You can still import the model as " | ||||
|                "a module and call its load() method, or create the symlink " | ||||
|                "manually.", | ||||
|                "%s --> %s" % (path2str(model_path), path2str(link_path)), | ||||
|                title="Error: Couldn't link model to '%s'" % link_name) | ||||
|         prints(Messages.M017, msg, title=Messages.M016.format(name=link_name)) | ||||
|         raise | ||||
|     prints("%s --> %s" % (path2str(model_path), path2str(link_path)), | ||||
|            "You can now load the model via spacy.load('%s')" % link_name, | ||||
|            title="Linking successful") | ||||
|     prints(msg, Messages.M019.format(name=link_name), title=Messages.M018) | ||||
|  |  | |||
|  | @ -5,6 +5,7 @@ import plac | |||
| import shutil | ||||
| from pathlib import Path | ||||
| 
 | ||||
| from ._messages import Messages | ||||
| from ..compat import path2str, json_dumps | ||||
| from ..util import prints | ||||
| from .. import util | ||||
|  | @ -31,17 +32,17 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, | |||
|     output_path = util.ensure_path(output_dir) | ||||
|     meta_path = util.ensure_path(meta_path) | ||||
|     if not input_path or not input_path.exists(): | ||||
|         prints(input_path, title="Model directory not found", exits=1) | ||||
|         prints(input_path, title=Messages.M008, exits=1) | ||||
|     if not output_path or not output_path.exists(): | ||||
|         prints(output_path, title="Output directory not found", exits=1) | ||||
|         prints(output_path, title=Messages.M040, exits=1) | ||||
|     if meta_path and not meta_path.exists(): | ||||
|         prints(meta_path, title="meta.json not found", exits=1) | ||||
|         prints(meta_path, title=Messages.M020, exits=1) | ||||
| 
 | ||||
|     meta_path = meta_path or input_path / 'meta.json' | ||||
|     if meta_path.is_file(): | ||||
|         meta = util.read_json(meta_path) | ||||
|         if not create_meta:  # only print this if user doesn't want to overwrite | ||||
|             prints(meta_path, title="Loaded meta.json from file") | ||||
|             prints(meta_path, title=Messages.M041) | ||||
|         else: | ||||
|             meta = generate_meta(input_dir, meta) | ||||
|     meta = validate_meta(meta, ['lang', 'name', 'version']) | ||||
|  | @ -57,9 +58,8 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, | |||
|     create_file(main_path / 'setup.py', TEMPLATE_SETUP) | ||||
|     create_file(main_path / 'MANIFEST.in', TEMPLATE_MANIFEST) | ||||
|     create_file(package_path / '__init__.py', TEMPLATE_INIT) | ||||
|     prints(main_path, "To build the package, run `python setup.py sdist` in " | ||||
|            "this directory.", | ||||
|            title="Successfully created package '%s'" % model_name_v) | ||||
|     prints(main_path, Messages.M043, | ||||
|            title=Messages.M042.format(name=model_name_v)) | ||||
| 
 | ||||
| 
 | ||||
| def create_dirs(package_path, force): | ||||
|  | @ -67,10 +67,7 @@ def create_dirs(package_path, force): | |||
|         if force: | ||||
|             shutil.rmtree(path2str(package_path)) | ||||
|         else: | ||||
|             prints(package_path, "Please delete the directory and try again, " | ||||
|                    "or use the --force flag to overwrite existing " | ||||
|                    "directories.", title="Package directory already exists", | ||||
|                    exits=1) | ||||
|             prints(package_path, Messages.M045, title=Messages.M044, exits=1) | ||||
|     Path.mkdir(package_path, parents=True) | ||||
| 
 | ||||
| 
 | ||||
|  | @ -97,9 +94,7 @@ def generate_meta(model_path, existing_meta): | |||
|     meta['vectors'] = {'width': nlp.vocab.vectors_length, | ||||
|                        'vectors': len(nlp.vocab.vectors), | ||||
|                        'keys': nlp.vocab.vectors.n_keys} | ||||
|     prints("Enter the package settings for your model. The following " | ||||
|            "information will be read from your model data: pipeline, vectors.", | ||||
|            title="Generating meta.json") | ||||
|     prints(Messages.M047, title=Messages.M046) | ||||
|     for setting, desc, default in settings: | ||||
|         response = util.get_raw_input(desc, default) | ||||
|         meta[setting] = default if response == '' and default else response | ||||
|  | @ -111,8 +106,7 @@ def generate_meta(model_path, existing_meta): | |||
| def validate_meta(meta, keys): | ||||
|     for key in keys: | ||||
|         if key not in meta or meta[key] == '': | ||||
|             prints("This setting is required to build your package.", | ||||
|                    title='No "%s" setting found in meta.json' % key, exits=1) | ||||
|             prints(Messages.M049, title=Messages.M048.format(key=key), exits=1) | ||||
|     return meta | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -7,6 +7,7 @@ import tqdm | |||
| from thinc.neural._classes.model import Model | ||||
| from timeit import default_timer as timer | ||||
| 
 | ||||
| from ._messages import Messages | ||||
| from ..attrs import PROB, IS_OOV, CLUSTER, LANG | ||||
| from ..gold import GoldCorpus, minibatch | ||||
| from ..util import prints | ||||
|  | @ -54,15 +55,15 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, | |||
|     if not output_path.exists(): | ||||
|         output_path.mkdir() | ||||
|     if not train_path.exists(): | ||||
|         prints(train_path, title="Training data not found", exits=1) | ||||
|         prints(train_path, title=Messages.M050, exits=1) | ||||
|     if dev_path and not dev_path.exists(): | ||||
|         prints(dev_path, title="Development data not found", exits=1) | ||||
|         prints(dev_path, title=Messages.M051, exits=1) | ||||
|     if meta_path is not None and not meta_path.exists(): | ||||
|         prints(meta_path, title="meta.json not found", exits=1) | ||||
|         prints(meta_path, title=Messages.M020, exits=1) | ||||
|     meta = util.read_json(meta_path) if meta_path else {} | ||||
|     if not isinstance(meta, dict): | ||||
|         prints("Expected dict but got: {}".format(type(meta)), | ||||
|                title="Not a valid meta.json format", exits=1) | ||||
|         prints(Messages.M053.format(meta_type=type(meta)), | ||||
|                title=Messages.M052, exits=1) | ||||
|     meta.setdefault('lang', lang) | ||||
|     meta.setdefault('name', 'unnamed') | ||||
| 
 | ||||
|  |  | |||
|  | @ -6,6 +6,7 @@ from pathlib import Path | |||
| import sys | ||||
| import ujson | ||||
| 
 | ||||
| from ._messages import Messages | ||||
| from ..compat import path2str, locale_escape, url_read, HTTPError | ||||
| from ..util import prints, get_data_path, read_json | ||||
| from .. import about | ||||
|  | @ -18,14 +19,13 @@ def validate(): | |||
|     try: | ||||
|         data = url_read(about.__compatibility__) | ||||
|     except HTTPError as e: | ||||
|         prints("Couldn't fetch compatibility table.", | ||||
|                title="Server error (%d: %s)" % (e.code, e.reason), exits=1) | ||||
|         title = Messages.M003.format(code=e.code, desc=e.reason) | ||||
|         prints(Messages.M021, title=title, exits=1) | ||||
|     compat = ujson.loads(data)['spacy'] | ||||
|     current_compat = compat.get(about.__version__) | ||||
|     if not current_compat: | ||||
|         prints(about.__compatibility__, exits=1, | ||||
|                title="Can't find spaCy v{} in compatibility table" | ||||
|                .format(about.__version__)) | ||||
|                title=Messages.M022.format(version=about.__version__)) | ||||
|     all_models = set() | ||||
|     for spacy_v, models in dict(compat).items(): | ||||
|         all_models.update(models.keys()) | ||||
|  | @ -42,7 +42,7 @@ def validate(): | |||
|     update_models = [m for m in incompat_models if m in current_compat] | ||||
| 
 | ||||
|     prints(path2str(Path(__file__).parent.parent), | ||||
|            title="Installed models (spaCy v{})".format(about.__version__)) | ||||
|            title=Messages.M023.format(version=about.__version__)) | ||||
|     if model_links or model_pkgs: | ||||
|         print(get_row('TYPE', 'NAME', 'MODEL', 'VERSION', '')) | ||||
|         for name, data in model_pkgs.items(): | ||||
|  | @ -50,23 +50,16 @@ def validate(): | |||
|         for name, data in model_links.items(): | ||||
|             print(get_model_row(current_compat, name, data, 'link')) | ||||
|     else: | ||||
|         prints("No models found in your current environment.", exits=0) | ||||
| 
 | ||||
|         prints(Messages.M024, exits=0) | ||||
|     if update_models: | ||||
|         cmd = '    python -m spacy download {}' | ||||
|         print("\n    Use the following commands to update the model packages:") | ||||
|         print("\n    " + Messages.M025) | ||||
|         print('\n'.join([cmd.format(pkg) for pkg in update_models])) | ||||
| 
 | ||||
|     if na_models: | ||||
|         prints("The following models are not available for spaCy v{}: {}" | ||||
|                .format(about.__version__, ', '.join(na_models))) | ||||
| 
 | ||||
|         prints(Messages.M026.format(version=about.__version__, | ||||
|                                     models=', '.join(na_models))) | ||||
|     if incompat_links: | ||||
|         prints("You may also want to overwrite the incompatible links using " | ||||
|                "the `python -m spacy link` command with `--force`, or remove " | ||||
|                "them from the data directory. Data path: {}" | ||||
|                .format(path2str(get_data_path()))) | ||||
| 
 | ||||
|         prints(Messages.M027.format(path=path2str(get_data_path()))) | ||||
|     if incompat_models or incompat_links: | ||||
|         sys.exit(1) | ||||
| 
 | ||||
|  |  | |||
|  | @ -4,6 +4,7 @@ from __future__ import unicode_literals | |||
| from .render import DependencyRenderer, EntityRenderer | ||||
| from ..tokens import Doc | ||||
| from ..compat import b_to_str | ||||
| from ..errors import Errors, Warnings, user_warning | ||||
| from ..util import prints, is_in_jupyter | ||||
| 
 | ||||
| 
 | ||||
|  | @ -27,7 +28,7 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, | |||
|     factories = {'dep': (DependencyRenderer, parse_deps), | ||||
|                  'ent': (EntityRenderer, parse_ents)} | ||||
|     if style not in factories: | ||||
|         raise ValueError("Unknown style: %s" % style) | ||||
|         raise ValueError(Errors.E087.format(style=style)) | ||||
|     if isinstance(docs, Doc) or isinstance(docs, dict): | ||||
|         docs = [docs] | ||||
|     renderer, converter = factories[style] | ||||
|  | @ -57,12 +58,12 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False, | |||
|     render(docs, style=style, page=page, minify=minify, options=options, | ||||
|            manual=manual) | ||||
|     httpd = simple_server.make_server('0.0.0.0', port, app) | ||||
|     prints("Using the '%s' visualizer" % style, | ||||
|            title="Serving on port %d..." % port) | ||||
|     prints("Using the '{}' visualizer".format(style), | ||||
|            title="Serving on port {}...".format(port)) | ||||
|     try: | ||||
|         httpd.serve_forever() | ||||
|     except KeyboardInterrupt: | ||||
|         prints("Shutting down server on port %d." % port) | ||||
|         prints("Shutting down server on port {}.".format(port)) | ||||
|     finally: | ||||
|         httpd.server_close() | ||||
| 
 | ||||
|  | @ -83,6 +84,8 @@ def parse_deps(orig_doc, options={}): | |||
|     RETURNS (dict): Generated dependency parse keyed by words and arcs. | ||||
|     """ | ||||
|     doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes()) | ||||
|     if not doc.is_parsed: | ||||
|         user_warning(Warnings.W005) | ||||
|     if options.get('collapse_punct', True): | ||||
|         spans = [] | ||||
|         for word in doc[:-1]: | ||||
|  | @ -120,6 +123,8 @@ def parse_ents(doc, options={}): | |||
|     """ | ||||
|     ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_} | ||||
|             for ent in doc.ents] | ||||
|     if not ents: | ||||
|         user_warning(Warnings.W006) | ||||
|     title = (doc.user_data.get('title', None) | ||||
|              if hasattr(doc, 'user_data') else None) | ||||
|     return {'text': doc.text, 'ents': ents, 'title': title} | ||||
|  |  | |||
							
								
								
									
spacy/errors.py (new file, +297 lines)
							|  | @ -0,0 +1,297 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import os | ||||
| import warnings | ||||
| import inspect | ||||
| 
 | ||||
| 
 | ||||
| def add_codes(err_cls): | ||||
|     """Add error codes to string messages via class attribute names.""" | ||||
|     class ErrorsWithCodes(object): | ||||
|         def __getattribute__(self, code): | ||||
|             msg = getattr(err_cls, code) | ||||
|             return '[{code}] {msg}'.format(code=code, msg=msg) | ||||
|     return ErrorsWithCodes() | ||||
| 
 | ||||
| 
 | ||||
| @add_codes | ||||
| class Warnings(object): | ||||
|     W001 = ("As of spaCy v2.0, the keyword argument `path=` is deprecated. " | ||||
|             "You can now call spacy.load with the path as its first argument, " | ||||
|             "and the model's meta.json will be used to determine the language " | ||||
|             "to load. For example:\nnlp = spacy.load('{path}')") | ||||
|     W002 = ("Tokenizer.from_list is now deprecated. Create a new Doc object " | ||||
|             "instead and pass in the strings as the `words` keyword argument, " | ||||
|             "for example:\nfrom spacy.tokens import Doc\n" | ||||
|             "doc = Doc(nlp.vocab, words=[...])") | ||||
|     W003 = ("Positional arguments to Doc.merge are deprecated. Instead, use " | ||||
|             "the keyword arguments, for example tag=, lemma= or ent_type=.") | ||||
|     W004 = ("No text fixing enabled. Run `pip install ftfy` to enable fixing " | ||||
|             "using ftfy.fix_text if necessary.") | ||||
|     W005 = ("Doc object not parsed. This means displaCy won't be able to " | ||||
|             "generate a dependency visualization for it. Make sure the Doc " | ||||
|             "was processed with a model that supports dependency parsing, and " | ||||
|             "not just a language class like `English()`. For more info, see " | ||||
|             "the docs:\nhttps://spacy.io/usage/models") | ||||
|     W006 = ("No entities to visualize found in Doc object. If this is " | ||||
|             "surprising to you, make sure the Doc was processed using a model " | ||||
|             "that supports named entity recognition, and check the `doc.ents` " | ||||
|             "property manually if necessary.") | ||||
| 
 | ||||
| 
 | ||||
| @add_codes | ||||
| class Errors(object): | ||||
|     E001 = ("No component '{name}' found in pipeline. Available names: {opts}") | ||||
|     E002 = ("Can't find factory for '{name}'. This usually happens when spaCy " | ||||
|             "calls `nlp.create_pipe` with a component name that's not built " | ||||
|             "in - for example, when constructing the pipeline from a model's " | ||||
|             "meta.json. If you're using a custom component, you can write to " | ||||
|             "`Language.factories['{name}']` or remove it from the model meta " | ||||
|             "and add it via `nlp.add_pipe` instead.") | ||||
|     E003 = ("Not a valid pipeline component. Expected callable, but " | ||||
|             "got {component} (name: '{name}').") | ||||
|     E004 = ("If you meant to add a built-in component, use `create_pipe`: " | ||||
|             "`nlp.add_pipe(nlp.create_pipe('{component}'))`") | ||||
|     E005 = ("Pipeline component '{name}' returned None. If you're using a " | ||||
|             "custom component, maybe you forgot to return the processed Doc?") | ||||
|     E006 = ("Invalid constraints. You can only set one of the following: " | ||||
|             "before, after, first, last.") | ||||
|     E007 = ("'{name}' already exists in pipeline. Existing names: {opts}") | ||||
|     E008 = ("Some current components would be lost when restoring previous " | ||||
|             "pipeline state. If you added components after calling " | ||||
|             "`nlp.disable_pipes()`, you should remove them explicitly with " | ||||
|             "`nlp.remove_pipe()` before the pipeline is restored. Names of " | ||||
|             "the new components: {names}") | ||||
|     E009 = ("The `update` method expects same number of docs and golds, but " | ||||
|             "got: {n_docs} docs, {n_golds} golds.") | ||||
|     E010 = ("Word vectors set to length 0. This may be because you don't have " | ||||
|             "a model installed or loaded, or because your model doesn't " | ||||
|             "include word vectors. For more info, see the docs:\n" | ||||
|             "https://spacy.io/usage/models") | ||||
|     E011 = ("Unknown operator: '{op}'. Options: {opts}") | ||||
|     E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}") | ||||
|     E013 = ("Error selecting action in matcher") | ||||
|     E014 = ("Unknown tag ID: {tag}") | ||||
|     E015 = ("Conflicting morphology exception for ({tag}, {orth}). Use " | ||||
|             "`force=True` to overwrite.") | ||||
|     E016 = ("MultitaskObjective target should be function or one of: dep, " | ||||
|             "tag, ent, dep_tag_offset, ent_tag.") | ||||
|     E017 = ("Can only add unicode or bytes. Got type: {value_type}") | ||||
|     E018 = ("Can't retrieve string for hash '{hash_value}'.") | ||||
|     E019 = ("Can't create transition with unknown action ID: {action}. Action " | ||||
|             "IDs are enumerated in spacy/syntax/{src}.pyx.") | ||||
|     E020 = ("Could not find a gold-standard action to supervise the " | ||||
|             "dependency parser. The tree is non-projective (i.e. it has " | ||||
|             "crossing arcs - see spacy/syntax/nonproj.pyx for definitions). " | ||||
|             "The ArcEager transition system only supports projective trees. " | ||||
|             "To learn non-projective representations, transform the data " | ||||
|             "before training and after parsing. Either pass " | ||||
|             "`make_projective=True` to the GoldParse class, or use " | ||||
|             "spacy.syntax.nonproj.preprocess_training_data.") | ||||
|     E021 = ("Could not find a gold-standard action to supervise the " | ||||
|             "dependency parser. The GoldParse was projective. The transition " | ||||
|             "system has {n_actions} actions. State at failure: {state}") | ||||
|     E022 = ("Could not find a transition with the name '{name}' in the NER " | ||||
|             "model.") | ||||
|     E023 = ("Error cleaning up beam: The same state occurred twice at " | ||||
|             "memory address {addr} and position {i}.") | ||||
|     E024 = ("Could not find an optimal move to supervise the parser. Usually, " | ||||
|             "this means the GoldParse was not correct. For example, are all " | ||||
|             "labels added to the model?") | ||||
|     E025 = ("String is too long: {length} characters. Max is 2**30.") | ||||
|     E026 = ("Error accessing token at position {i}: out of bounds in Doc of " | ||||
|             "length {length}.") | ||||
|     E027 = ("Arguments 'words' and 'spaces' should be sequences of the same " | ||||
|             "length, or 'spaces' should be left default at None. spaces " | ||||
|             "should be a sequence of booleans, with True meaning that the " | ||||
|             "word owns a ' ' character following it.") | ||||
|     E028 = ("orths_and_spaces expects either a list of unicode strings or a " | ||||
|             "list of (unicode, bool) tuples. Got bytes instance: {value}") | ||||
|     E029 = ("noun_chunks requires the dependency parse, which requires a " | ||||
|             "statistical model to be installed and loaded. For more info, see " | ||||
|             "the documentation:\nhttps://spacy.io/usage/models") | ||||
|     E030 = ("Sentence boundaries unset. You can add the 'sentencizer' " | ||||
|             "component to the pipeline with: " | ||||
|             "nlp.add_pipe(nlp.create_pipe('sentencizer')) " | ||||
|             "Alternatively, add the dependency parser, or set sentence " | ||||
|             "boundaries by setting doc[i].is_sent_start.") | ||||
|     E031 = ("Invalid token: empty string ('') at position {i}.") | ||||
|     E032 = ("Conflicting attributes specified in doc.from_array(): " | ||||
|             "(HEAD, SENT_START). The HEAD attribute currently sets sentence " | ||||
|             "boundaries implicitly, based on the tree structure. This means " | ||||
|             "the HEAD attribute would potentially override the sentence " | ||||
|             "boundaries set by SENT_START.") | ||||
|     E033 = ("Cannot load into non-empty Doc of length {length}.") | ||||
|     E034 = ("Doc.merge received {n_args} non-keyword arguments. Expected " | ||||
|             "either 3 arguments (deprecated), or 0 (use keyword arguments).\n" | ||||
|             "Arguments supplied:\n{args}\nKeyword arguments:{kwargs}") | ||||
|     E035 = ("Error creating span with start {start} and end {end} for Doc of " | ||||
|             "length {length}.") | ||||
|     E036 = ("Error calculating span: Can't find a token starting at character " | ||||
|             "offset {start}.") | ||||
|     E037 = ("Error calculating span: Can't find a token ending at character " | ||||
|             "offset {end}.") | ||||
|     E038 = ("Error finding sentence for span. Infinite loop detected.") | ||||
|     E039 = ("Array bounds exceeded while searching for root word. This likely " | ||||
|             "means the parse tree is in an invalid state. Please report this " | ||||
|             "issue here: http://github.com/explosion/spaCy/issues") | ||||
|     E040 = ("Attempt to access token at {i}, max length {max_length}.") | ||||
|     E041 = ("Invalid comparison operator: {op}. Likely a Cython bug?") | ||||
|     E042 = ("Error accessing doc[{i}].nbor({j}), for doc of length {length}.") | ||||
|     E043 = ("Refusing to write to token.sent_start if its document is parsed, " | ||||
|             "because this may cause inconsistent state.") | ||||
|     E044 = ("Invalid value for token.sent_start: {value}. Must be one of: " | ||||
|             "None, True, False") | ||||
|     E045 = ("Possibly infinite loop encountered while looking for {attr}.") | ||||
|     E046 = ("Can't retrieve unregistered extension attribute '{name}'. Did " | ||||
|             "you forget to call the `set_extension` method?") | ||||
|     E047 = ("Can't assign a value to unregistered extension attribute " | ||||
|             "'{name}'. Did you forget to call the `set_extension` method?") | ||||
|     E048 = ("Can't import language {lang} from spacy.lang.") | ||||
|     E049 = ("Can't find spaCy data directory: '{path}'. Check your " | ||||
|             "installation and permissions, or use spacy.util.set_data_path " | ||||
|             "to customise the location if necessary.") | ||||
|     E050 = ("Can't find model '{name}'. It doesn't seem to be a shortcut " | ||||
|             "link, a Python package or a valid path to a data directory.") | ||||
|     E051 = ("Can't load '{name}'. If you're using a shortcut link, make sure " | ||||
|             "it points to a valid package (not just a data directory).") | ||||
|     E052 = ("Can't find model directory: {path}") | ||||
|     E053 = ("Could not read meta.json from {path}") | ||||
|     E054 = ("No valid '{setting}' setting found in model meta.json.") | ||||
|     E055 = ("Invalid ORTH value in exception:\nKey: {key}\nOrths: {orths}") | ||||
|     E056 = ("Invalid tokenizer exception: ORTH values combined don't match " | ||||
|             "original string.\nKey: {key}\nOrths: {orths}") | ||||
|     E057 = ("Stepped slices not supported in Span objects. Try: " | ||||
|             "list(tokens)[start:stop:step] instead.") | ||||
|     E058 = ("Could not retrieve vector for key {key}.") | ||||
|     E059 = ("One (and only one) keyword arg must be set. Got: {kwargs}") | ||||
|     E060 = ("Cannot add new key to vectors: the table is full. Current shape: " | ||||
|             "({rows}, {cols}).") | ||||
|     E061 = ("Bad file name: {filename}. Example of a valid file name: " | ||||
|             "'vectors.128.f.bin'") | ||||
|     E062 = ("Cannot find empty bit for new lexical flag. All bits between 0 " | ||||
|             "and 63 are occupied. You can replace one by specifying the " | ||||
|             "`flag_id` explicitly, e.g. " | ||||
|             "`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA)`.") | ||||
|     E063 = ("Invalid value for flag_id: {value}. Flag IDs must be between 1 " | ||||
|             "and 63 (inclusive).") | ||||
|     E064 = ("Error fetching a Lexeme from the Vocab. When looking up a " | ||||
|             "string, the lexeme returned had an orth ID that did not match " | ||||
|             "the query string. This means that the cached lexeme structs are " | ||||
|             "mismatched to the string encoding table. The mismatched:\n" | ||||
|             "Query string: {string}\nOrth cached: {orth}\nOrth ID: {orth_id}") | ||||
|     E065 = ("Only one of the vector table's width and shape can be specified. " | ||||
|             "Got width {width} and shape {shape}.") | ||||
|     E066 = ("Error creating model helper for extracting columns. Can only " | ||||
|             "extract columns by positive integer. Got: {value}.") | ||||
|     E067 = ("Invalid BILUO tag sequence: Got a tag starting with 'I' (inside " | ||||
|             "an entity) without a preceding 'B' (beginning of an entity). " | ||||
|             "Tag sequence:\n{tags}") | ||||
|     E068 = ("Invalid BILUO tag: '{tag}'.") | ||||
|     E069 = ("Invalid gold-standard parse tree. Found cycle between word " | ||||
|             "IDs: {cycle}") | ||||
|     E070 = ("Invalid gold-standard data. Number of documents ({n_docs}) " | ||||
|             "does not align with number of annotations ({n_annots}).") | ||||
|     E071 = ("Error creating lexeme: specified orth ID ({orth}) does not " | ||||
|             "match the one in the vocab ({vocab_orth}).") | ||||
|     E072 = ("Error serializing lexeme: expected data length {length}, " | ||||
|             "got {bad_length}.") | ||||
|     E073 = ("Cannot assign vector of length {new_length}. Existing vectors " | ||||
|             "are of length {length}. You can use `vocab.reset_vectors` to " | ||||
|             "clear the existing vectors and resize the table.") | ||||
|     E074 = ("Error interpreting compiled match pattern: patterns are expected " | ||||
|             "to end with the attribute {attr}. Got: {bad_attr}.") | ||||
|     E075 = ("Error accepting match: length ({length}) > maximum length " | ||||
|             "({max_len}).") | ||||
|     E076 = ("Error setting tensor on Doc: tensor has {rows} rows, while Doc " | ||||
|             "has {words} words.") | ||||
|     E077 = ("Error computing {value}: number of Docs ({n_docs}) does not " | ||||
|             "equal number of GoldParse objects ({n_golds}) in batch.") | ||||
|     E078 = ("Error computing score: number of words in Doc ({words_doc}) does " | ||||
|             "not equal number of words in GoldParse ({words_gold}).") | ||||
|     E079 = ("Error computing states in beam: number of predicted beams " | ||||
|             "({pbeams}) does not equal number of gold beams ({gbeams}).") | ||||
|     E080 = ("Duplicate state found in beam: {key}.") | ||||
|     E081 = ("Error getting gradient in beam: number of histories ({n_hist}) " | ||||
|             "does not equal number of losses ({losses}).") | ||||
|     E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), " | ||||
|             "projective heads ({n_proj_heads}) and labels ({n_labels}) do not " | ||||
|             "match.") | ||||
|     E083 = ("Error setting extension: only one of default, getter, setter and " | ||||
|             "method is allowed. {n_args} keyword arguments were specified.") | ||||
|     E084 = ("Error assigning label ID {label} to span: not in StringStore.") | ||||
|     E085 = ("Can't create lexeme for string '{string}'.") | ||||
|     E086 = ("Error deserializing lexeme '{string}': orth ID {orth_id} does " | ||||
|             "not match hash {hash_id} in StringStore.") | ||||
|     E087 = ("Unknown displaCy style: {style}.") | ||||
|     E088 = ("Text of length {length} exceeds maximum of {max_length}. The " | ||||
|             "v2.x parser and NER models require roughly 1GB of temporary " | ||||
|             "memory per 100,000 characters in the input. This means long " | ||||
|             "texts may cause memory allocation errors. If you're not using " | ||||
|             "the parser or NER, it's probably safe to increase the " | ||||
|             "`nlp.max_length` limit. The limit is in number of characters, so " | ||||
|             "you can check whether your inputs are too long by checking " | ||||
|             "`len(text)`.") | ||||
| 
 | ||||
| 
 | ||||
| @add_codes | ||||
| class TempErrors(object): | ||||
|     T001 = ("Max length currently 10 for phrase matching") | ||||
|     T002 = ("Pattern length ({doc_len}) >= phrase_matcher.max_length " | ||||
|             "({max_len}). Length can be set on initialization, up to 10.") | ||||
|     T003 = ("Resizing pre-trained Tagger models is not currently supported.") | ||||
|     T004 = ("Currently parser depth is hard-coded to 1. Received: {value}.") | ||||
|     T005 = ("Currently history size is hard-coded to 0. Received: {value}.") | ||||
|     T006 = ("Currently history width is hard-coded to 0. Received: {value}.") | ||||
|     T007 = ("Can't yet set {attr} from Span. Vote for this feature on the " | ||||
|             "issue tracker: http://github.com/explosion/spaCy/issues") | ||||
| 
 | ||||
| 
 | ||||
| class ModelsWarning(UserWarning): | ||||
|     pass | ||||
| 
 | ||||
| 
 | ||||
| WARNINGS = { | ||||
|     'user': UserWarning, | ||||
|     'deprecation': DeprecationWarning, | ||||
|     'models': ModelsWarning, | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| def _get_warn_types(arg): | ||||
|     if arg == '':  # don't show any warnings | ||||
|         return [] | ||||
|     if not arg or arg == 'all':  # show all available warnings | ||||
|         return WARNINGS.keys() | ||||
|     return [w_type.strip() for w_type in arg.split(',') | ||||
|             if w_type.strip() in WARNINGS] | ||||
| 
 | ||||
| 
 | ||||
| SPACY_WARNING_FILTER = os.environ.get('SPACY_WARNING_FILTER', 'always') | ||||
| SPACY_WARNING_TYPES = _get_warn_types(os.environ.get('SPACY_WARNING_TYPES')) | ||||
| 
 | ||||
| 
 | ||||
| def user_warning(message): | ||||
|     _warn(message, 'user') | ||||
| 
 | ||||
| 
 | ||||
| def deprecation_warning(message): | ||||
|     _warn(message, 'deprecation') | ||||
| 
 | ||||
| 
 | ||||
| def models_warning(message): | ||||
|     _warn(message, 'models') | ||||
| 
 | ||||
| 
 | ||||
| def _warn(message, warn_type='user'): | ||||
|     """ | ||||
|     message (unicode): The message to display. | ||||
|     category (Warning): The Warning to show. | ||||
|     """ | ||||
|     if warn_type in SPACY_WARNING_TYPES: | ||||
|         category = WARNINGS[warn_type] | ||||
|         stack = inspect.stack()[-1] | ||||
|         with warnings.catch_warnings(): | ||||
|             warnings.simplefilter(SPACY_WARNING_FILTER, category) | ||||
|             warnings.warn_explicit(message, category, stack[1], stack[2]) | ||||
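Both environment variables are read when spacy.errors is imported, so they are normally set in the shell that launches the process. A minimal sketch of setting them from Python before importing spaCy; the values shown are just examples.

import os

os.environ['SPACY_WARNING_TYPES'] = 'user,deprecation'   # omit 'models' to silence those warnings
os.environ['SPACY_WARNING_FILTER'] = 'once'              # any warnings-module filter action works
import spacy  # noqa: E402 -- imported only after the environment is configured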
|  | @ -10,6 +10,7 @@ import itertools | |||
| 
 | ||||
| from .syntax import nonproj | ||||
| from .tokens import Doc | ||||
| from .errors import Errors | ||||
| from . import util | ||||
| from .util import minibatch | ||||
| 
 | ||||
|  | @ -28,7 +29,8 @@ def tags_to_entities(tags): | |||
|         elif tag == '-': | ||||
|             continue | ||||
|         elif tag.startswith('I'): | ||||
|             assert start is not None, tags[:i] | ||||
|             if start is None: | ||||
|                 raise ValueError(Errors.E067.format(tags=tags[:i])) | ||||
|             continue | ||||
|         if tag.startswith('U'): | ||||
|             entities.append((tag[2:], i, i)) | ||||
|  | @ -38,7 +40,7 @@ def tags_to_entities(tags): | |||
|             entities.append((tag[2:], start, i)) | ||||
|             start = None | ||||
|         else: | ||||
|             raise Exception(tag) | ||||
|             raise ValueError(Errors.E068.format(tag=tag)) | ||||
|     return entities | ||||
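A hedged illustration of what tags_to_entities() returns for a well-formed BILOU sequence; the B- branch that sets `start` sits outside this hunk, so the expected output assumes it behaves as in the rest of the function.

from spacy.gold import tags_to_entities

tags = ['U-PERSON', '-', 'B-ORG', 'L-ORG']
print(tags_to_entities(tags))
# expected: [('PERSON', 0, 0), ('ORG', 2, 3)]   # (label, start, end), ends inclusive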
| 
 | ||||
| 
 | ||||
|  | @ -238,7 +240,9 @@ class GoldCorpus(object): | |||
| 
 | ||||
|     @classmethod | ||||
|     def _make_golds(cls, docs, paragraph_tuples): | ||||
|         assert len(docs) == len(paragraph_tuples) | ||||
|         if len(docs) != len(paragraph_tuples): | ||||
|             raise ValueError(Errors.E070.format(n_docs=len(docs), | ||||
|                                                 n_annots=len(paragraph_tuples))) | ||||
|         if len(docs) == 1: | ||||
|             return [GoldParse.from_annot_tuples(docs[0], | ||||
|                                                 paragraph_tuples[0][0])] | ||||
|  | @ -461,7 +465,7 @@ cdef class GoldParse: | |||
| 
 | ||||
|         cycle = nonproj.contains_cycle(self.heads) | ||||
|         if cycle is not None: | ||||
|             raise Exception("Cycle found: %s" % cycle) | ||||
|             raise ValueError(Errors.E069.format(cycle=cycle)) | ||||
| 
 | ||||
|         if make_projective: | ||||
|             proj_heads, _ = nonproj.projectivize(self.heads, self.labels) | ||||
|  |  | |||
|  | @ -28,6 +28,7 @@ from .lang.punctuation import TOKENIZER_INFIXES | |||
| from .lang.tokenizer_exceptions import TOKEN_MATCH | ||||
| from .lang.tag_map import TAG_MAP | ||||
| from .lang.lex_attrs import LEX_ATTRS, is_stop | ||||
| from .errors import Errors | ||||
| from . import util | ||||
| from . import about | ||||
| 
 | ||||
|  | @ -217,8 +218,7 @@ class Language(object): | |||
|         for pipe_name, component in self.pipeline: | ||||
|             if pipe_name == name: | ||||
|                 return component | ||||
|         msg = "No component '{}' found in pipeline. Available names: {}" | ||||
|         raise KeyError(msg.format(name, self.pipe_names)) | ||||
|         raise KeyError(Errors.E001.format(name=name, opts=self.pipe_names)) | ||||
| 
 | ||||
|     def create_pipe(self, name, config=dict()): | ||||
|         """Create a pipeline component from a factory. | ||||
|  | @ -228,7 +228,7 @@ class Language(object): | |||
|         RETURNS (callable): Pipeline component. | ||||
|         """ | ||||
|         if name not in self.factories: | ||||
|             raise KeyError("Can't find factory for '{}'.".format(name)) | ||||
|             raise KeyError(Errors.E002.format(name=name)) | ||||
|         factory = self.factories[name] | ||||
|         return factory(self, **config) | ||||
| 
 | ||||
|  | @ -253,12 +253,9 @@ class Language(object): | |||
|             >>> nlp.add_pipe(component, name='custom_name', last=True) | ||||
|         """ | ||||
|         if not hasattr(component, '__call__'): | ||||
|             msg = ("Not a valid pipeline component. Expected callable, but " | ||||
|                    "got {}. ".format(repr(component))) | ||||
|             msg = Errors.E003.format(component=repr(component), name=name) | ||||
|             if isinstance(component, basestring_) and component in self.factories: | ||||
|                 msg += ("If you meant to add a built-in component, use " | ||||
|                         "create_pipe: nlp.add_pipe(nlp.create_pipe('{}'))" | ||||
|                         .format(component)) | ||||
|                 msg += Errors.E004.format(component=component) | ||||
|             raise ValueError(msg) | ||||
|         if name is None: | ||||
|             if hasattr(component, 'name'): | ||||
|  | @ -271,11 +268,9 @@ class Language(object): | |||
|             else: | ||||
|                 name = repr(component) | ||||
|         if name in self.pipe_names: | ||||
|             raise ValueError("'{}' already exists in pipeline.".format(name)) | ||||
|             raise ValueError(Errors.E007.format(name=name, opts=self.pipe_names)) | ||||
|         if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2: | ||||
|             msg = ("Invalid constraints. You can only set one of the " | ||||
|                    "following: before, after, first, last.") | ||||
|             raise ValueError(msg) | ||||
|             raise ValueError(Errors.E006) | ||||
|         pipe = (name, component) | ||||
|         if last or not any([first, before, after]): | ||||
|             self.pipeline.append(pipe) | ||||
|  | @ -286,9 +281,8 @@ class Language(object): | |||
|         elif after and after in self.pipe_names: | ||||
|             self.pipeline.insert(self.pipe_names.index(after) + 1, pipe) | ||||
|         else: | ||||
|             msg = "Can't find '{}' in pipeline. Available names: {}" | ||||
|             unfound = before or after | ||||
|             raise ValueError(msg.format(unfound, self.pipe_names)) | ||||
|             raise ValueError(Errors.E001.format(name=before or after, | ||||
|                                                 opts=self.pipe_names)) | ||||
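A hedged usage sketch of the placement constraints validated above; the model name and the component are illustrative and not defined in this commit.

import spacy

nlp = spacy.load('en_core_web_sm')            # assumed model name

def my_component(doc):                        # illustrative pipeline component
    return doc

nlp.add_pipe(my_component, name='custom', before='parser')   # one constraint: fine
# nlp.add_pipe(my_component, first=True, last=True)          # two constraints -> ValueError(E006)
# nlp.add_pipe(my_component, after='no_such_pipe')           # unknown anchor -> ValueError(E001)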
| 
 | ||||
|     def has_pipe(self, name): | ||||
|         """Check if a component name is present in the pipeline. Equivalent to | ||||
|  | @ -306,8 +300,7 @@ class Language(object): | |||
|         component (callable): Pipeline component. | ||||
|         """ | ||||
|         if name not in self.pipe_names: | ||||
|             msg = "Can't find '{}' in pipeline. Available names: {}" | ||||
|             raise ValueError(msg.format(name, self.pipe_names)) | ||||
|             raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names)) | ||||
|         self.pipeline[self.pipe_names.index(name)] = (name, component) | ||||
| 
 | ||||
|     def rename_pipe(self, old_name, new_name): | ||||
|  | @ -317,11 +310,9 @@ class Language(object): | |||
|         new_name (unicode): New name of the component. | ||||
|         """ | ||||
|         if old_name not in self.pipe_names: | ||||
|             msg = "Can't find '{}' in pipeline. Available names: {}" | ||||
|             raise ValueError(msg.format(old_name, self.pipe_names)) | ||||
|             raise ValueError(Errors.E001.format(name=old_name, opts=self.pipe_names)) | ||||
|         if new_name in self.pipe_names: | ||||
|             msg = "'{}' already exists in pipeline. Existing names: {}" | ||||
|             raise ValueError(msg.format(new_name, self.pipe_names)) | ||||
|             raise ValueError(Errors.E007.format(name=new_name, opts=self.pipe_names)) | ||||
|         i = self.pipe_names.index(old_name) | ||||
|         self.pipeline[i] = (new_name, self.pipeline[i][1]) | ||||
| 
 | ||||
|  | @ -332,8 +323,7 @@ class Language(object): | |||
|         RETURNS (tuple): A `(name, component)` tuple of the removed component. | ||||
|         """ | ||||
|         if name not in self.pipe_names: | ||||
|             msg = "Can't find '{}' in pipeline. Available names: {}" | ||||
|             raise ValueError(msg.format(name, self.pipe_names)) | ||||
|             raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names)) | ||||
|         return self.pipeline.pop(self.pipe_names.index(name)) | ||||
| 
 | ||||
|     def __call__(self, text, disable=[]): | ||||
|  | @ -351,21 +341,17 @@ class Language(object): | |||
|             ('An', 'NN') | ||||
|         """ | ||||
|         if len(text) >= self.max_length: | ||||
|             msg = ( | ||||
|                 "Text of length {length} exceeds maximum of {max_length}. " | ||||
|                 "The v2 parser and NER models require roughly 1GB of temporary " | ||||
|                 "memory per 100,000 characters in the input. This means long " | ||||
|                 "texts may cause memory allocation errors. If you're not using " | ||||
|                 "the parser or NER, it's probably safe to increase the " | ||||
|                 "nlp.max_length limit. The limit is in number of characters, " | ||||
|                 "so you can check whether your inputs are too long by checking " | ||||
|                 "len(text).") | ||||
|             raise ValueError(msg.format(length=len(text), max_length=self.max_length)) | ||||
|             raise ValueError(Errors.E088.format(length=len(text), | ||||
|                                                 max_length=self.max_length)) | ||||
|         doc = self.make_doc(text) | ||||
|         for name, proc in self.pipeline: | ||||
|             if name in disable: | ||||
|                 continue | ||||
|             if not hasattr(proc, '__call__'): | ||||
|                 raise ValueError(Errors.E003.format(component=type(proc), name=name)) | ||||
|             doc = proc(doc) | ||||
|             if doc is None: | ||||
|                 raise ValueError(Errors.E005.format(name=name)) | ||||
|         return doc | ||||
| 
 | ||||
|     def disable_pipes(self, *names): | ||||
|  | @ -407,8 +393,7 @@ class Language(object): | |||
|             >>>            state = nlp.update(docs, golds, sgd=optimizer) | ||||
|         """ | ||||
|         if len(docs) != len(golds): | ||||
|             raise IndexError("Update expects same number of docs and golds " | ||||
|                              "Got: %d, %d" % (len(docs), len(golds))) | ||||
|             raise IndexError(Errors.E009.format(n_docs=len(docs), n_golds=len(golds))) | ||||
|         if len(docs) == 0: | ||||
|             return | ||||
|         if sgd is None: | ||||
|  | @ -757,14 +742,7 @@ class DisabledPipes(list): | |||
|         if unexpected: | ||||
|             # Don't change the pipeline if we're raising an error. | ||||
|             self.nlp.pipeline = current | ||||
|             msg = ( | ||||
|                 "Some current components would be lost when restoring " | ||||
|                 "previous pipeline state. If you added components after " | ||||
|                 "calling nlp.disable_pipes(), you should remove them " | ||||
|                 "explicitly with nlp.remove_pipe() before the pipeline is " | ||||
|                 "restore. Names of the new components: %s" | ||||
|             ) | ||||
|             raise ValueError(msg % unexpected) | ||||
|             raise ValueError(Errors.E008.format(names=unexpected)) | ||||
|         self[:] = [] | ||||
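A hedged usage sketch of the restore path above; nlp.disable_pipes() returns a DisabledPipes instance that restores the pipeline when the block exits, and the model name is an assumption.

import spacy

nlp = spacy.load('en_core_web_sm')            # assumed model name
with nlp.disable_pipes('tagger', 'parser'):
    doc = nlp(u'Only the remaining components run here.')
# The disabled pipes are restored on exit. Adding a new component inside the
# block and leaving it in place is what triggers Errors.E008 during restore.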
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -15,7 +15,7 @@ from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP | |||
| from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_CURRENCY, IS_OOV | ||||
| from .attrs cimport PROB | ||||
| from .attrs import intify_attrs | ||||
| from . import about | ||||
| from .errors import Errors | ||||
| 
 | ||||
| 
 | ||||
| memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) | ||||
|  | @ -37,7 +37,8 @@ cdef class Lexeme: | |||
|         self.vocab = vocab | ||||
|         self.orth = orth | ||||
|         self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth) | ||||
|         assert self.c.orth == orth | ||||
|         if self.c.orth != orth: | ||||
|             raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth)) | ||||
| 
 | ||||
|     def __richcmp__(self, other, int op): | ||||
|         if other is None: | ||||
|  | @ -129,20 +130,25 @@ cdef class Lexeme: | |||
|         lex_data = Lexeme.c_to_bytes(self.c) | ||||
|         start = <const char*>&self.c.flags | ||||
|         end = <const char*>&self.c.sentiment + sizeof(self.c.sentiment) | ||||
|         assert (end-start) == sizeof(lex_data.data), (end-start, sizeof(lex_data.data)) | ||||
|         if (end-start) != sizeof(lex_data.data): | ||||
|             raise ValueError(Errors.E072.format(length=end-start, | ||||
|                                                 bad_length=sizeof(lex_data.data))) | ||||
|         byte_string = b'\0' * sizeof(lex_data.data) | ||||
|         byte_chars = <char*>byte_string | ||||
|         for i in range(sizeof(lex_data.data)): | ||||
|             byte_chars[i] = lex_data.data[i] | ||||
|         assert len(byte_string) == sizeof(lex_data.data), (len(byte_string), | ||||
|                 sizeof(lex_data.data)) | ||||
|         if len(byte_string) != sizeof(lex_data.data): | ||||
|             raise ValueError(Errors.E072.format(length=len(byte_string), | ||||
|                                                 bad_length=sizeof(lex_data.data))) | ||||
|         return byte_string | ||||
| 
 | ||||
|     def from_bytes(self, bytes byte_string): | ||||
|         # This method doesn't really have a use-case --- wrote it for testing. | ||||
|         # Possibly delete? It puts the Lexeme out of synch with the vocab. | ||||
|         cdef SerializedLexemeC lex_data | ||||
|         assert len(byte_string) == sizeof(lex_data.data) | ||||
|         if len(byte_string) != sizeof(lex_data.data): | ||||
|             raise ValueError(Errors.E072.format(length=len(byte_string), | ||||
|                                                 bad_length=sizeof(lex_data.data))) | ||||
|         for i in range(len(byte_string)): | ||||
|             lex_data.data[i] = byte_string[i] | ||||
|         Lexeme.c_from_bytes(self.c, lex_data) | ||||
|  | @ -169,16 +175,13 @@ cdef class Lexeme: | |||
|         def __get__(self): | ||||
|             cdef int length = self.vocab.vectors_length | ||||
|             if length == 0: | ||||
|                 raise ValueError( | ||||
|                     "Word vectors set to length 0. This may be because you " | ||||
|                     "don't have a model installed or loaded, or because your " | ||||
|                     "model doesn't include word vectors. For more info, see " | ||||
|                     "the documentation: \n%s\n" % about.__docs_models__ | ||||
|                 ) | ||||
|                 raise ValueError(Errors.E010) | ||||
|             return self.vocab.get_vector(self.c.orth) | ||||
| 
 | ||||
|         def __set__(self, vector): | ||||
|             assert len(vector) == self.vocab.vectors_length | ||||
|             if len(vector) != self.vocab.vectors_length: | ||||
|                 raise ValueError(Errors.E073.format(new_length=len(vector), | ||||
|                                                     length=self.vocab.vectors_length)) | ||||
|             self.vocab.set_vector(self.c.orth, vector) | ||||
| 
 | ||||
|     property rank: | ||||
|  |  | |||
|  | @ -16,6 +16,7 @@ from .typedefs cimport hash_t | |||
| from .structs cimport TokenC | ||||
| from .tokens.doc cimport Doc, get_token_attr | ||||
| from .vocab cimport Vocab | ||||
| from .errors import Errors, TempErrors | ||||
| 
 | ||||
| from .attrs import IDS | ||||
| from .attrs cimport attr_id_t, ID, NULL_ATTR | ||||
|  | @ -109,7 +110,8 @@ cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0: | |||
|     while pattern.nr_attr != 0: | ||||
|         pattern += 1 | ||||
|     id_attr = pattern[0].attrs[0] | ||||
|     assert id_attr.attr == ID | ||||
|     if id_attr.attr != ID: | ||||
|         raise ValueError(Errors.E074.format(attr=ID, bad_attr=id_attr.attr)) | ||||
|     return id_attr.value | ||||
| 
 | ||||
| 
 | ||||
|  | @ -161,8 +163,8 @@ def _convert_strings(token_specs, string_store): | |||
|                 if value in operators: | ||||
|                     ops = operators[value] | ||||
|                 else: | ||||
|                     msg = "Unknown operator '%s'. Options: %s" | ||||
|                     raise KeyError(msg % (value, ', '.join(operators.keys()))) | ||||
|                     keys = ', '.join(operators.keys()) | ||||
|                     raise KeyError(Errors.E011.format(op=value, opts=keys)) | ||||
|             if isinstance(attr, basestring): | ||||
|                 attr = IDS.get(attr.upper()) | ||||
|             if isinstance(value, basestring): | ||||
|  | @ -264,9 +266,7 @@ cdef class Matcher: | |||
|         """ | ||||
|         for pattern in patterns: | ||||
|             if len(pattern) == 0: | ||||
|                 msg = ("Cannot add pattern for zero tokens to matcher.\n" | ||||
|                        "key: {key}\n") | ||||
|                 raise ValueError(msg.format(key=key)) | ||||
|                 raise ValueError(Errors.E012.format(key=key)) | ||||
|         key = self._normalize_key(key) | ||||
|         for pattern in patterns: | ||||
|             specs = _convert_strings(pattern, self.vocab.strings) | ||||
|  | @ -348,13 +348,12 @@ cdef class Matcher: | |||
|             for state in partials: | ||||
|                 action = get_action(state.second, token) | ||||
|                 if action == PANIC: | ||||
|                     raise Exception("Error selecting action in matcher") | ||||
|                     raise ValueError(Errors.E013) | ||||
|                 while action == ADVANCE_ZERO: | ||||
|                     state.second += 1 | ||||
|                     action = get_action(state.second, token) | ||||
|                 if action == PANIC: | ||||
|                     raise Exception("Error selecting action in matcher") | ||||
| 
 | ||||
|                     raise ValueError(Errors.E013) | ||||
|                 if action == REPEAT: | ||||
|                     # Leave the state in the queue, and advance to next slot | ||||
|                     # (i.e. we don't overwrite -- we want to greedily match | ||||
|  | @ -380,7 +379,7 @@ cdef class Matcher: | |||
|             for pattern in self.patterns: | ||||
|                 action = get_action(pattern, token) | ||||
|                 if action == PANIC: | ||||
|                     raise Exception("Error selecting action in matcher") | ||||
|                     raise ValueError(Errors.E013) | ||||
|                 while action == ADVANCE_ZERO: | ||||
|                     pattern += 1 | ||||
|                     action = get_action(pattern, token) | ||||
|  | @ -447,7 +446,7 @@ def get_bilou(length): | |||
|         return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, | ||||
|                 I10_ENT, I10_ENT, L10_ENT] | ||||
|     else: | ||||
|         raise ValueError("Max length currently 10 for phrase matching") | ||||
|         raise ValueError(TempErrors.T001) | ||||
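A hedged sketch of the phrase-pattern length limits enforced here and in PhraseMatcher.add() below; the pattern text and model name are illustrative.

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load('en_core_web_sm')                            # assumed model name
matcher = PhraseMatcher(nlp.vocab)
matcher.add('PRODUCT', None, nlp(u'deep learning toolkit'))   # 3 tokens: fine
# A pattern of 10 or more tokens fails the `len(doc) >= max_length` check in
# PhraseMatcher.add() and raises the TempErrors.T002 ValueError.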
| 
 | ||||
| 
 | ||||
| cdef class PhraseMatcher: | ||||
|  | @ -506,11 +505,8 @@ cdef class PhraseMatcher: | |||
|         cdef Doc doc | ||||
|         for doc in docs: | ||||
|             if len(doc) >= self.max_length: | ||||
|                 msg = ( | ||||
|                     "Pattern length (%d) >= phrase_matcher.max_length (%d). " | ||||
|                     "Length can be set on initialization, up to 10." | ||||
|                 ) | ||||
|                 raise ValueError(msg % (len(doc), self.max_length)) | ||||
|                 raise ValueError(TempErrors.T002.format(doc_len=len(doc), | ||||
|                                                         max_len=self.max_length)) | ||||
|         cdef hash_t ent_id = self.matcher._normalize_key(key) | ||||
|         self._callbacks[ent_id] = on_match | ||||
|         cdef int length | ||||
|  | @ -562,7 +558,9 @@ cdef class PhraseMatcher: | |||
|             yield doc | ||||
| 
 | ||||
|     def accept_match(self, Doc doc, int start, int end): | ||||
|         assert (end - start) < self.max_length | ||||
|         if (end - start) >= self.max_length: | ||||
|             raise ValueError(Errors.E075.format(length=end - start, | ||||
|                                                 max_len=self.max_length)) | ||||
|         cdef int i, j | ||||
|         for i in range(self.max_length): | ||||
|             self._phrase_key[i] = 0 | ||||
|  |  | |||
|  | @ -9,6 +9,7 @@ from .attrs import LEMMA, intify_attrs | |||
| from .parts_of_speech cimport SPACE | ||||
| from .parts_of_speech import IDS as POS_IDS | ||||
| from .lexeme cimport Lexeme | ||||
| from .errors import Errors | ||||
| 
 | ||||
| 
 | ||||
| def _normalize_props(props): | ||||
|  | @ -93,7 +94,7 @@ cdef class Morphology: | |||
| 
 | ||||
|     cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1: | ||||
|         if tag_id > self.n_tags: | ||||
|             raise ValueError("Unknown tag ID: %s" % tag_id) | ||||
|             raise ValueError(Errors.E014.format(tag=tag_id)) | ||||
|         # TODO: It's pretty arbitrary to put this logic here. I guess the | ||||
|         # justification is that this is where the specific word and the tag | ||||
|         # interact. Still, we should have a better way to enforce this rule, or | ||||
|  | @ -147,9 +148,7 @@ cdef class Morphology: | |||
|         elif force: | ||||
|             memset(cached, 0, sizeof(cached[0])) | ||||
|         else: | ||||
|             raise ValueError( | ||||
|                 "Conflicting morphology exception for (%s, %s). Use " | ||||
|                 "force=True to overwrite." % (tag_str, orth_str)) | ||||
|             raise ValueError(Errors.E015.format(tag=tag_str, orth=orth_str)) | ||||
| 
 | ||||
|         cached.tag = rich_tag | ||||
|         # TODO: Refactor this to take arbitrary attributes. | ||||
|  |  | |||
|  | @ -33,6 +33,7 @@ from .parts_of_speech import X | |||
| from ._ml import Tok2Vec, build_text_classifier, build_tagger_model | ||||
| from ._ml import link_vectors_to_models, zero_init, flatten | ||||
| from ._ml import create_default_optimizer | ||||
| from .errors import Errors, TempErrors | ||||
| from . import util | ||||
| 
 | ||||
| 
 | ||||
|  | @ -336,7 +337,8 @@ class Tensorizer(Pipe): | |||
|         tensors (object): Vector representation for each token in the docs. | ||||
|         """ | ||||
|         for doc, tensor in zip(docs, tensors): | ||||
|             assert tensor.shape[0] == len(doc) | ||||
|             if tensor.shape[0] != len(doc): | ||||
|                 raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc))) | ||||
|             doc.tensor = tensor | ||||
| 
 | ||||
|     def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None): | ||||
|  | @ -550,9 +552,7 @@ class Tagger(Pipe): | |||
|             # copy_array(larger.W[:smaller.nO], smaller.W) | ||||
|             # copy_array(larger.b[:smaller.nO], smaller.b) | ||||
|             # self.model._layers[-1] = larger | ||||
|             raise ValueError( | ||||
|                 "Resizing pre-trained Tagger models is not " | ||||
|                 "currently supported.") | ||||
|             raise ValueError(TempErrors.T003) | ||||
|         tag_map = dict(self.vocab.morphology.tag_map) | ||||
|         if values is None: | ||||
|             values = {POS: "X"} | ||||
|  | @ -671,8 +671,7 @@ class MultitaskObjective(Tagger): | |||
|         elif hasattr(target, '__call__'): | ||||
|             self.make_label = target | ||||
|         else: | ||||
|             raise ValueError("MultitaskObjective target should be function or " | ||||
|                              "one of: dep, tag, ent, dep_tag_offset, ent_tag.") | ||||
|             raise ValueError(Errors.E016) | ||||
|         self.cfg = dict(cfg) | ||||
|         self.cfg.setdefault('cnn_maxout_pieces', 2) | ||||
| 
 | ||||
|  | @ -723,7 +722,9 @@ class MultitaskObjective(Tagger): | |||
|         return tokvecs, scores | ||||
| 
 | ||||
|     def get_loss(self, docs, golds, scores): | ||||
|         assert len(docs) == len(golds) | ||||
|         if len(docs) != len(golds): | ||||
|             raise ValueError(Errors.E077.format(value='loss', n_docs=len(docs), | ||||
|                                                 n_golds=len(golds))) | ||||
|         cdef int idx = 0 | ||||
|         correct = numpy.zeros((scores.shape[0],), dtype='i') | ||||
|         guesses = scores.argmax(axis=1) | ||||
|  |  | |||
|  | @ -2,6 +2,7 @@ | |||
| from __future__ import division, print_function, unicode_literals | ||||
| 
 | ||||
| from .gold import tags_to_entities | ||||
| from .errors import Errors | ||||
| 
 | ||||
| 
 | ||||
| class PRFScore(object): | ||||
|  | @ -84,7 +85,8 @@ class Scorer(object): | |||
|         } | ||||
| 
 | ||||
|     def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')): | ||||
|         assert len(tokens) == len(gold) | ||||
|         if len(tokens) != len(gold): | ||||
|             raise ValueError(Errors.E078.format(words_doc=len(tokens), words_gold=len(gold))) | ||||
|         gold_deps = set() | ||||
|         gold_tags = set() | ||||
|         gold_ents = set(tags_to_entities([annot[-1] | ||||
|  |  | |||
|  | @ -13,6 +13,7 @@ from .symbols import IDS as SYMBOLS_BY_STR | |||
| from .symbols import NAMES as SYMBOLS_BY_INT | ||||
| from .typedefs cimport hash_t | ||||
| from .compat import json_dumps | ||||
| from .errors import Errors | ||||
| from . import util | ||||
| 
 | ||||
| 
 | ||||
|  | @ -59,7 +60,6 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e | |||
|         string.p = <unsigned char*>mem.alloc(length + 1, sizeof(unsigned char)) | ||||
|         string.p[0] = length | ||||
|         memcpy(&string.p[1], chars, length) | ||||
|         assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0] | ||||
|         return string | ||||
|     else: | ||||
|         i = 0 | ||||
|  | @ -69,7 +69,6 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e | |||
|             string.p[i] = 255 | ||||
|         string.p[n_length_bytes-1] = length % 255 | ||||
|         memcpy(&string.p[n_length_bytes], chars, length) | ||||
|         assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0] | ||||
|         return string | ||||
| 
 | ||||
| 
 | ||||
|  | @ -115,7 +114,7 @@ cdef class StringStore: | |||
|             self.hits.insert(key) | ||||
|             utf8str = <Utf8Str*>self._map.get(key) | ||||
|             if utf8str is NULL: | ||||
|                 raise KeyError(string_or_id) | ||||
|                 raise KeyError(Errors.E018.format(hash_value=string_or_id)) | ||||
|             else: | ||||
|                 return decode_Utf8Str(utf8str) | ||||
| 
 | ||||
|  | @ -136,8 +135,7 @@ cdef class StringStore: | |||
|             key = hash_utf8(string, len(string)) | ||||
|             self._intern_utf8(string, len(string)) | ||||
|         else: | ||||
|             raise TypeError( | ||||
|                 "Can only add unicode or bytes. Got type: %s" % type(string)) | ||||
|             raise TypeError(Errors.E017.format(value_type=type(string))) | ||||
|         return key | ||||
| 
 | ||||
|     def __len__(self): | ||||
|  |  | |||
|  | @ -10,6 +10,7 @@ from thinc.extra.search cimport MaxViolation | |||
| 
 | ||||
| from .transition_system cimport TransitionSystem, Transition | ||||
| from ..gold cimport GoldParse | ||||
| from ..errors import Errors | ||||
| from .stateclass cimport StateC, StateClass | ||||
| 
 | ||||
| 
 | ||||
|  | @ -220,7 +221,8 @@ def get_states(pbeams, gbeams, beam_map, nr_update): | |||
|     p_indices = [] | ||||
|     g_indices = [] | ||||
|     cdef Beam pbeam, gbeam | ||||
|     assert len(pbeams) == len(gbeams) | ||||
|     if len(pbeams) != len(gbeams): | ||||
|         raise ValueError(Errors.E079.format(pbeams=len(pbeams), gbeams=len(gbeams))) | ||||
|     for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)): | ||||
|         p_indices.append([]) | ||||
|         g_indices.append([]) | ||||
|  | @ -228,7 +230,8 @@ def get_states(pbeams, gbeams, beam_map, nr_update): | |||
|             state = StateClass.borrow(<StateC*>pbeam.at(i)) | ||||
|             if not state.is_final(): | ||||
|                 key = tuple([eg_id] + pbeam.histories[i]) | ||||
|                 assert key not in seen, (key, seen) | ||||
|                 if key in seen: | ||||
|                     raise ValueError(Errors.E080.format(key=key)) | ||||
|                 seen[key] = len(states) | ||||
|                 p_indices[-1].append(len(states)) | ||||
|                 states.append(state) | ||||
|  | @ -271,7 +274,8 @@ def get_gradient(nr_class, beam_maps, histories, losses): | |||
|     for i in range(nr_step): | ||||
|         grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), | ||||
|                                  dtype='f')) | ||||
|     assert len(histories) == len(losses) | ||||
|     if len(histories) != len(losses): | ||||
|         raise ValueError(Errors.E081.format(n_hist=len(histories), losses=len(losses))) | ||||
|     for eg_id, hists in enumerate(histories): | ||||
|         for loss, hist in zip(losses[eg_id], hists): | ||||
|             if loss == 0.0 or numpy.isnan(loss): | ||||
|  |  | |||
|  | @ -15,6 +15,7 @@ from .nonproj import is_nonproj_tree | |||
| from .transition_system cimport move_cost_func_t, label_cost_func_t | ||||
| from ..gold cimport GoldParse, GoldParseC | ||||
| from ..structs cimport TokenC | ||||
| from ..errors import Errors | ||||
| 
 | ||||
| 
 | ||||
| DEF NON_MONOTONIC = True | ||||
|  | @ -455,7 +456,7 @@ cdef class ArcEager(TransitionSystem): | |||
|             t.do = Break.transition | ||||
|             t.get_cost = Break.cost | ||||
|         else: | ||||
|             raise Exception(move) | ||||
|             raise ValueError(Errors.E019.format(action=move, src='arc_eager')) | ||||
|         return t | ||||
| 
 | ||||
|     cdef int initialize_state(self, StateC* st) nogil: | ||||
|  | @ -529,28 +530,11 @@ cdef class ArcEager(TransitionSystem): | |||
|         if n_gold < 1: | ||||
|             # Check projectivity --- leading cause | ||||
|             if is_nonproj_tree(gold.heads): | ||||
|                 raise ValueError( | ||||
|                     "Could not find a gold-standard action to supervise the " | ||||
|                     "dependency parser. Likely cause: the tree is " | ||||
|                     "non-projective (i.e. it has crossing arcs -- see " | ||||
|                     "spacy/syntax/nonproj.pyx for definitions). The ArcEager " | ||||
|                     "transition system only supports projective trees. To " | ||||
|                     "learn non-projective representations, transform the data " | ||||
|                     "before training and after parsing. Either pass " | ||||
|                     "make_projective=True to the GoldParse class, or use " | ||||
|                     "spacy.syntax.nonproj.preprocess_training_data.") | ||||
|                 raise ValueError(Errors.E020) | ||||
|             else: | ||||
|                 print(gold.orig_annot) | ||||
|                 print(gold.words) | ||||
|                 print(gold.heads) | ||||
|                 print(gold.labels) | ||||
|                 print(gold.sent_starts) | ||||
|                 raise ValueError( | ||||
|                     "Could not find a gold-standard action to supervise the" | ||||
|                     "dependency parser. The GoldParse was projective. The " | ||||
|                     "transition system has %d actions. State at failure: %s" | ||||
|                     % (self.n_moves, stcls.print_state(gold.words))) | ||||
|         assert n_gold >= 1 | ||||
|                 failure_state = stcls.print_state(gold.words) | ||||
|                 raise ValueError(Errors.E021.format(n_actions=self.n_moves, | ||||
|                                                     state=failure_state)) | ||||
| 
 | ||||
|     def get_beam_annot(self, Beam beam): | ||||
|         length = (<StateC*>beam.at(0)).length | ||||
|  |  | |||
|  | @ -10,6 +10,7 @@ from ._state cimport StateC | |||
| from .transition_system cimport Transition | ||||
| from .transition_system cimport do_func_t | ||||
| from ..gold cimport GoldParseC, GoldParse | ||||
| from ..errors import Errors | ||||
| 
 | ||||
| 
 | ||||
| cdef enum: | ||||
|  | @ -173,7 +174,7 @@ cdef class BiluoPushDown(TransitionSystem): | |||
|             if self.c[i].move == move and self.c[i].label == label: | ||||
|                 return self.c[i] | ||||
|         else: | ||||
|             raise KeyError(name) | ||||
|             raise KeyError(Errors.E022.format(name=name)) | ||||
| 
 | ||||
|     cdef Transition init_transition(self, int clas, int move, attr_t label) except *: | ||||
|         # TODO: Apparent Cython bug here when we try to use the Transition() | ||||
|  | @ -208,7 +209,7 @@ cdef class BiluoPushDown(TransitionSystem): | |||
|             t.do = Out.transition | ||||
|             t.get_cost = Out.cost | ||||
|         else: | ||||
|             raise Exception(move) | ||||
|             raise ValueError(Errors.E019.format(action=move, src='ner')) | ||||
|         return t | ||||
| 
 | ||||
|     def add_action(self, int action, label_name): | ||||
|  | @ -230,7 +231,6 @@ cdef class BiluoPushDown(TransitionSystem): | |||
|             self._size *= 2 | ||||
|             self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0])) | ||||
|         self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id) | ||||
|         assert self.c[self.n_moves].label == label_id | ||||
|         self.n_moves += 1 | ||||
|         return 1 | ||||
| 
 | ||||
|  |  | |||
|  | @ -34,6 +34,7 @@ from .._ml import link_vectors_to_models, create_default_optimizer | |||
| from ..compat import json_dumps, copy_array | ||||
| from ..tokens.doc cimport Doc | ||||
| from ..gold cimport GoldParse | ||||
| from ..errors import Errors, TempErrors | ||||
| from .. import util | ||||
| from .stateclass cimport StateClass | ||||
| from ._state cimport StateC | ||||
|  | @ -242,7 +243,7 @@ cdef class Parser: | |||
|     def Model(cls, nr_class, **cfg): | ||||
|         depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1)) | ||||
|         if depth != 1: | ||||
|             raise ValueError("Currently parser depth is hard-coded to 1.") | ||||
|             raise ValueError(TempErrors.T004.format(value=depth)) | ||||
|         parser_maxout_pieces = util.env_opt('parser_maxout_pieces', | ||||
|                                             cfg.get('maxout_pieces', 2)) | ||||
|         token_vector_width = util.env_opt('token_vector_width', | ||||
|  | @ -252,9 +253,9 @@ cdef class Parser: | |||
|         hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0)) | ||||
|         hist_width = util.env_opt('history_width', cfg.get('hist_width', 0)) | ||||
|         if hist_size != 0: | ||||
|             raise ValueError("Currently history size is hard-coded to 0") | ||||
|             raise ValueError(TempErrors.T005.format(value=hist_size)) | ||||
|         if hist_width != 0: | ||||
|             raise ValueError("Currently history width is hard-coded to 0") | ||||
|             raise ValueError(TempErrors.T006.format(value=hist_width)) | ||||
|         pretrained_vectors = cfg.get('pretrained_vectors', None) | ||||
|         tok2vec = Tok2Vec(token_vector_width, embed_size, | ||||
|                           pretrained_vectors=pretrained_vectors) | ||||
|  | @ -542,7 +543,9 @@ cdef class Parser: | |||
|     def update(self, docs, golds, drop=0., sgd=None, losses=None): | ||||
|         if not any(self.moves.has_gold(gold) for gold in golds): | ||||
|             return None | ||||
|         assert len(docs) == len(golds) | ||||
|         if len(docs) != len(golds): | ||||
|             raise ValueError(Errors.E077.format(value='update', n_docs=len(docs), | ||||
|                                                 n_golds=len(golds))) | ||||
|         if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.0: | ||||
|             return self.update_beam(docs, golds, | ||||
|                     self.cfg['beam_width'], self.cfg['beam_density'], | ||||
|  | @ -622,7 +625,6 @@ cdef class Parser: | |||
|         if losses is not None and self.name not in losses: | ||||
|             losses[self.name] = 0. | ||||
|         lengths = [len(d) for d in docs] | ||||
|         assert min(lengths) >= 1 | ||||
|         states = self.moves.init_batch(docs) | ||||
|         for gold in golds: | ||||
|             self.moves.preprocess_gold(gold) | ||||
|  | @ -1021,15 +1023,11 @@ def _cleanup(Beam beam): | |||
|             del state | ||||
|             seen.add(addr) | ||||
|         else: | ||||
|             print(i, addr) | ||||
|             print(seen) | ||||
|             raise Exception | ||||
|             raise ValueError(Errors.E023.format(addr=addr, i=i)) | ||||
|         addr = <size_t>beam._states[i].content | ||||
|         if addr not in seen: | ||||
|             state = <StateC*>addr | ||||
|             del state | ||||
|             seen.add(addr) | ||||
|         else: | ||||
|             print(i, addr) | ||||
|             print(seen) | ||||
|             raise Exception | ||||
|             raise ValueError(Errors.E023.format(addr=addr, i=i)) | ||||
|  |  | |||
|  | @ -10,6 +10,7 @@ from __future__ import unicode_literals | |||
| from copy import copy | ||||
| 
 | ||||
| from ..tokens.doc cimport Doc | ||||
| from ..errors import Errors | ||||
| 
 | ||||
| 
 | ||||
| DELIMITER = '||' | ||||
|  | @ -131,7 +132,10 @@ cpdef deprojectivize(Doc doc): | |||
| 
 | ||||
| def _decorate(heads, proj_heads, labels): | ||||
|     # uses decoration scheme HEAD from Nivre & Nilsson 2005 | ||||
|     assert(len(heads) == len(proj_heads) == len(labels)) | ||||
|     if (len(heads) != len(proj_heads)) or (len(proj_heads) != len(labels)): | ||||
|         raise ValueError(Errors.E082.format(n_heads=len(heads), | ||||
|                                             n_proj_heads=len(proj_heads), | ||||
|                                             n_labels=len(labels))) | ||||
|     deco_labels = [] | ||||
|     for tokenid, head in enumerate(heads): | ||||
|         if head != proj_heads[tokenid]: | ||||
|  |  | |||
|  | @ -12,6 +12,7 @@ from ..structs cimport TokenC | |||
| from .stateclass cimport StateClass | ||||
| from ..typedefs cimport attr_t | ||||
| from ..compat import json_dumps | ||||
| from ..errors import Errors | ||||
| from .. import util | ||||
| 
 | ||||
| 
 | ||||
|  | @ -80,10 +81,7 @@ cdef class TransitionSystem: | |||
|                     action.do(state.c, action.label) | ||||
|                     break | ||||
|             else: | ||||
|                 print(gold.words) | ||||
|                 print(gold.ner) | ||||
|                 print(history) | ||||
|                 raise ValueError("Could not find gold move") | ||||
|                 raise ValueError(Errors.E024) | ||||
|         return history | ||||
| 
 | ||||
|     cdef int initialize_state(self, StateC* state) nogil: | ||||
|  | @ -130,17 +128,7 @@ cdef class TransitionSystem: | |||
|             else: | ||||
|                 costs[i] = 9000 | ||||
|         if n_gold <= 0: | ||||
|             print(gold.words) | ||||
|             print(gold.ner) | ||||
|             print([gold.c.ner[i].clas for i in range(gold.length)]) | ||||
|             print([gold.c.ner[i].move for i in range(gold.length)]) | ||||
|             print([gold.c.ner[i].label for i in range(gold.length)]) | ||||
|             print("Self labels", | ||||
|                   [self.c[i].label for i in range(self.n_moves)]) | ||||
|             raise ValueError( | ||||
|                 "Could not find a gold-standard action to supervise " | ||||
|                 "the entity recognizer. The transition system has " | ||||
|                 "%d actions." % (self.n_moves)) | ||||
|             raise ValueError(Errors.E024) | ||||
| 
 | ||||
|     def get_class_name(self, int clas): | ||||
|         act = self.c[clas] | ||||
|  | @ -162,7 +150,6 @@ cdef class TransitionSystem: | |||
|             self._size *= 2 | ||||
|             self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0])) | ||||
|         self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id) | ||||
|         assert self.c[self.n_moves].label == label_id | ||||
|         self.n_moves += 1 | ||||
|         return 1 | ||||
| 
 | ||||
|  |  | |||
|  | @ -13,6 +13,7 @@ cimport cython | |||
| 
 | ||||
| from .tokens.doc cimport Doc | ||||
| from .strings cimport hash_string | ||||
| from .errors import Errors, Warnings, deprecation_warning | ||||
| from . import util | ||||
| 
 | ||||
| 
 | ||||
|  | @ -63,11 +64,7 @@ cdef class Tokenizer: | |||
|         return (self.__class__, args, None, None) | ||||
| 
 | ||||
|     cpdef Doc tokens_from_list(self, list strings): | ||||
|         util.deprecated( | ||||
|             "Tokenizer.from_list is now deprecated. Create a new Doc " | ||||
|             "object instead and pass in the strings as the `words` keyword " | ||||
|             "argument, for example:\nfrom spacy.tokens import Doc\n" | ||||
|             "doc = Doc(nlp.vocab, words=[...])") | ||||
|         deprecation_warning(Warnings.W002) | ||||
|         return Doc(self.vocab, words=strings) | ||||
| 
 | ||||
|     @cython.boundscheck(False) | ||||
|  | @ -78,8 +75,7 @@ cdef class Tokenizer: | |||
|         RETURNS (Doc): A container for linguistic annotations. | ||||
|         """ | ||||
|         if len(string) >= (2 ** 30): | ||||
|             msg = "String is too long: %d characters. Max is 2**30." | ||||
|             raise ValueError(msg % len(string)) | ||||
|             raise ValueError(Errors.E025.format(length=len(string))) | ||||
|         cdef int length = len(string) | ||||
|         cdef Doc doc = Doc(self.vocab) | ||||
|         if length == 0: | ||||
|  |  | |||
|  | @ -31,7 +31,7 @@ from ..attrs cimport ENT_TYPE, SENT_START | |||
| from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t | ||||
| from ..util import normalize_slice | ||||
| from ..compat import is_config, copy_reg, pickle, basestring_ | ||||
| from .. import about | ||||
| from ..errors import Errors, Warnings, deprecation_warning | ||||
| from .. import util | ||||
| from .underscore import Underscore | ||||
| from ._retokenize import Retokenizer | ||||
|  | @ -41,9 +41,9 @@ DEF PADDING = 5 | |||
| 
 | ||||
| cdef int bounds_check(int i, int length, int padding) except -1: | ||||
|     if (i + padding) < 0: | ||||
|         raise IndexError | ||||
|         raise IndexError(Errors.E026.format(i=i, length=length)) | ||||
|     if (i - padding) >= length: | ||||
|         raise IndexError | ||||
|         raise IndexError(Errors.E026.format(i=i, length=length)) | ||||
| 
 | ||||
| 
 | ||||
| cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: | ||||
|  | @ -98,7 +98,8 @@ cdef class Doc: | |||
|     def set_extension(cls, name, default=None, method=None, | ||||
|                       getter=None, setter=None): | ||||
|         nr_defined = sum(t is not None for t in (default, getter, setter, method)) | ||||
|         assert nr_defined == 1 | ||||
|         if nr_defined != 1: | ||||
|             raise ValueError(Errors.E083.format(n_args=nr_defined)) | ||||
|         Underscore.doc_extensions[name] = (default, method, getter, setter) | ||||
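A hedged sketch of the "exactly one of default, method, getter, setter" rule enforced above; the extension names are illustrative.

from spacy.tokens import Doc

Doc.set_extension('is_greeting', default=False)                     # one keyword: fine
Doc.set_extension('num_tokens', getter=lambda doc: len(doc))        # also fine
# Doc.set_extension('bad', default=False, getter=lambda doc: True)  # -> ValueError(E083)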
| 
 | ||||
|     @classmethod | ||||
|  | @ -155,11 +156,7 @@ cdef class Doc: | |||
|             if spaces is None: | ||||
|                 spaces = [True] * len(words) | ||||
|             elif len(spaces) != len(words): | ||||
|                 raise ValueError( | ||||
|                     "Arguments 'words' and 'spaces' should be sequences of " | ||||
|                     "the same length, or 'spaces' should be left default at " | ||||
|                     "None. spaces should be a sequence of booleans, with True " | ||||
|                     "meaning that the word owns a ' ' character following it.") | ||||
|                 raise ValueError(Errors.E027) | ||||
|             orths_and_spaces = zip(words, spaces) | ||||
|         if orths_and_spaces is not None: | ||||
|             for orth_space in orths_and_spaces: | ||||
|  | @ -167,10 +164,7 @@ cdef class Doc: | |||
|                     orth = orth_space | ||||
|                     has_space = True | ||||
|                 elif isinstance(orth_space, bytes): | ||||
|                     raise ValueError( | ||||
|                         "orths_and_spaces expects either List(unicode) or " | ||||
|                         "List((unicode, bool)). " | ||||
|                         "Got bytes instance: %s" % (str(orth_space))) | ||||
|                     raise ValueError(Errors.E028.format(value=orth_space)) | ||||
|                 else: | ||||
|                     orth, has_space = orth_space | ||||
|                 # Note that we pass self.mem here --- we have ownership, if LexemeC | ||||
|  | @ -504,11 +498,7 @@ cdef class Doc: | |||
|         """ | ||||
|         def __get__(self): | ||||
|             if not self.is_parsed: | ||||
|                 raise ValueError( | ||||
|                     "noun_chunks requires the dependency parse, which " | ||||
|                     "requires a statistical model to be installed and loaded. " | ||||
|                     "For more info, see the " | ||||
|                     "documentation: \n%s\n" % about.__docs_models__) | ||||
|                 raise ValueError(Errors.E029) | ||||
|             # Accumulate the result before beginning to iterate over it. This | ||||
|             # prevents the tokenisation from being changed out from under us | ||||
|             # during the iteration. The tricky thing here is that Span accepts | ||||
|  | @ -533,12 +523,7 @@ cdef class Doc: | |||
|         """ | ||||
|         def __get__(self): | ||||
|             if not self.is_sentenced: | ||||
|                 raise ValueError( | ||||
|                     "Sentence boundaries unset. You can add the 'sentencizer' " | ||||
|                     "component to the pipeline with: " | ||||
|                     "nlp.add_pipe(nlp.create_pipe('sentencizer')) " | ||||
|                     "Alternatively, add the dependency parser, or set " | ||||
|                     "sentence boundaries by setting doc[i].sent_start") | ||||
|                 raise ValueError(Errors.E030) | ||||
|             if 'sents' in self.user_hooks: | ||||
|                 yield from self.user_hooks['sents'](self) | ||||
|             else: | ||||
|  | @ -568,7 +553,8 @@ cdef class Doc: | |||
|             t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy | ||||
|         t.l_edge = self.length | ||||
|         t.r_edge = self.length | ||||
|         assert t.lex.orth != 0 | ||||
|         if t.lex.orth == 0: | ||||
|             raise ValueError(Errors.E031.format(i=self.length)) | ||||
|         t.spacy = has_space | ||||
|         self.length += 1 | ||||
|         return t.idx + t.lex.length + t.spacy | ||||
|  | @ -684,13 +670,7 @@ cdef class Doc: | |||
| 
 | ||||
|     def from_array(self, attrs, array): | ||||
|         if SENT_START in attrs and HEAD in attrs: | ||||
|             raise ValueError( | ||||
|                 "Conflicting attributes specified in doc.from_array(): " | ||||
|                 "(HEAD, SENT_START)\n" | ||||
|                 "The HEAD attribute currently sets sentence boundaries " | ||||
|                 "implicitly, based on the tree structure. This means the HEAD " | ||||
|                 "attribute would potentially override the sentence boundaries " | ||||
|                 "set by SENT_START.") | ||||
|             raise ValueError(Errors.E032) | ||||
|         cdef int i, col | ||||
|         cdef attr_id_t attr_id | ||||
|         cdef TokenC* tokens = self.c | ||||
|  | @ -828,7 +808,7 @@ cdef class Doc: | |||
|         RETURNS (Doc): Itself. | ||||
|         """ | ||||
|         if self.length != 0: | ||||
|             raise ValueError("Cannot load into non-empty Doc") | ||||
|             raise ValueError(Errors.E033.format(length=self.length)) | ||||
|         deserializers = { | ||||
|             'text': lambda b: None, | ||||
|             'array_head': lambda b: None, | ||||
|  | @ -916,10 +896,7 @@ cdef class Doc: | |||
|         """ | ||||
|         cdef unicode tag, lemma, ent_type | ||||
|         if len(args) == 3: | ||||
|             util.deprecated( | ||||
|                 "Positional arguments to Doc.merge are deprecated. Instead, " | ||||
|                 "use the keyword arguments, for example tag=, lemma= or " | ||||
|                 "ent_type=.") | ||||
|             deprecation_warning(Warnings.W003) | ||||
|             tag, lemma, ent_type = args | ||||
|             attributes[TAG] = tag | ||||
|             attributes[LEMMA] = lemma | ||||
|  | @ -933,13 +910,9 @@ cdef class Doc: | |||
|             if 'ent_type' in attributes: | ||||
|                 attributes[ENT_TYPE] = attributes['ent_type'] | ||||
|         elif args: | ||||
|             raise ValueError( | ||||
|                 "Doc.merge received %d non-keyword arguments. Expected either " | ||||
|                 "3 arguments (deprecated), or 0 (use keyword arguments). " | ||||
|                 "Arguments supplied:\n%s\n" | ||||
|                 "Keyword arguments: %s\n" % (len(args), repr(args), | ||||
|                                              repr(attributes))) | ||||
| 
 | ||||
|             raise ValueError(Errors.E034.format(n_args=len(args), | ||||
|                                                 args=repr(args), | ||||
|                                                 kwargs=repr(attributes))) | ||||
|         # More deprecated attribute handling =/ | ||||
|         if 'label' in attributes: | ||||
|             attributes['ent_type'] = attributes.pop('label') | ||||
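A hedged sketch contrasting the keyword form with the deprecated three-positional-argument form handled above; the example text and attribute values are illustrative, and the first two arguments are assumed to be character offsets as in the v2.x Doc.merge() signature.

import spacy

nlp = spacy.load('en_core_web_sm')                      # assumed model name
doc = nlp(u'I flew to New York City last week.')
span = doc[3:6]                                         # "New York City"
doc.merge(span.start_char, span.end_char,
          tag=u'NNP', lemma=u'New York City', ent_type=u'GPE')   # keyword form
# doc.merge(span.start_char, span.end_char,
#           u'NNP', u'New York City', u'GPE')           # positional form emits Warnings.W003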
|  |  | |||
|  | @ -16,7 +16,7 @@ from ..util import normalize_slice | |||
| from ..attrs cimport IS_PUNCT, IS_SPACE | ||||
| from ..lexeme cimport Lexeme | ||||
| from ..compat import is_config | ||||
| from .. import about | ||||
| from ..errors import Errors, TempErrors | ||||
| from .underscore import Underscore | ||||
| 
 | ||||
| 
 | ||||
|  | @ -48,8 +48,7 @@ cdef class Span: | |||
|         RETURNS (Span): The newly constructed object. | ||||
|         """ | ||||
|         if not (0 <= start <= end <= len(doc)): | ||||
|             raise IndexError | ||||
| 
 | ||||
|             raise IndexError(Errors.E035.format(start=start, end=end, length=len(doc))) | ||||
|         self.doc = doc | ||||
|         self.start = start | ||||
|         self.start_char = self.doc[start].idx if start < self.doc.length else 0 | ||||
|  | @ -58,7 +57,8 @@ cdef class Span: | |||
|             self.end_char = self.doc[end - 1].idx + len(self.doc[end - 1]) | ||||
|         else: | ||||
|             self.end_char = 0 | ||||
|         assert label in doc.vocab.strings, label | ||||
|         if label not in doc.vocab.strings: | ||||
|             raise ValueError(Errors.E084.format(label=label)) | ||||
|         self.label = label | ||||
|         self._vector = vector | ||||
|         self._vector_norm = vector_norm | ||||
|  | @ -267,11 +267,10 @@ cdef class Span: | |||
|         or (self.doc.c[self.end-1].idx + self.doc.c[self.end-1].lex.length) != self.end_char: | ||||
|             start = token_by_start(self.doc.c, self.doc.length, self.start_char) | ||||
|             if self.start == -1: | ||||
|                 raise IndexError("Error calculating span: Can't find start") | ||||
|                 raise IndexError(Errors.E036.format(start=self.start_char)) | ||||
|             end = token_by_end(self.doc.c, self.doc.length, self.end_char) | ||||
|             if end == -1: | ||||
|                 raise IndexError("Error calculating span: Can't find end") | ||||
| 
 | ||||
|                 raise IndexError(Errors.E037.format(end=self.end_char)) | ||||
|             self.start = start | ||||
|             self.end = end + 1 | ||||
| 
 | ||||
|  | @ -293,7 +292,7 @@ cdef class Span: | |||
|                 root += root.head | ||||
|                 n += 1 | ||||
|                 if n >= self.doc.length: | ||||
|                     raise RuntimeError | ||||
|                     raise RuntimeError(Errors.E038) | ||||
|             return self.doc[root.l_edge:root.r_edge + 1] | ||||
| 
 | ||||
|     property has_vector: | ||||
|  | @ -376,11 +375,7 @@ cdef class Span: | |||
|         """ | ||||
|         def __get__(self): | ||||
|             if not self.doc.is_parsed: | ||||
|                 raise ValueError( | ||||
|                     "noun_chunks requires the dependency parse, which " | ||||
|                     "requires a statistical model to be installed and loaded. " | ||||
|                     "For more info, see the " | ||||
|                     "documentation: \n%s\n" % about.__docs_models__) | ||||
|                 raise ValueError(Errors.E029) | ||||
|             # Accumulate the result before beginning to iterate over it. This | ||||
|             # prevents the tokenisation from being changed out from under us | ||||
|             # during the iteration. The tricky thing here is that Span accepts | ||||
|  | @ -526,9 +521,7 @@ cdef class Span: | |||
|             return self.root.ent_id | ||||
| 
 | ||||
|         def __set__(self, hash_t key): | ||||
|             raise NotImplementedError( | ||||
|                 "Can't yet set ent_id from Span. Vote for this feature on " | ||||
|                 "the issue tracker: http://github.com/explosion/spaCy/issues") | ||||
|             raise NotImplementedError(TempErrors.T007.format(attr='ent_id')) | ||||
| 
 | ||||
|     property ent_id_: | ||||
|         """RETURNS (unicode): The (string) entity ID.""" | ||||
|  | @ -536,9 +529,7 @@ cdef class Span: | |||
|             return self.root.ent_id_ | ||||
| 
 | ||||
|         def __set__(self, hash_t key): | ||||
|             raise NotImplementedError( | ||||
|                 "Can't yet set ent_id_ from Span. Vote for this feature on the " | ||||
|                 "issue tracker: http://github.com/explosion/spaCy/issues") | ||||
|             raise NotImplementedError(TempErrors.T007.format(attr='ent_id_')) | ||||
| 
 | ||||
|     property orth_: | ||||
|         """Verbatim text content (identical to Span.text). Exists mostly for | ||||
|  | @ -586,9 +577,5 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1: | |||
|         token += token.head | ||||
|         n += 1 | ||||
|         if n >= sent_length: | ||||
|             raise RuntimeError( | ||||
|                 "Array bounds exceeded while searching for root word. This " | ||||
|                 "likely means the parse tree is in an invalid state. Please " | ||||
|                 "report this issue here: " | ||||
|                 "http://github.com/explosion/spaCy/issues") | ||||
|             raise RuntimeError(Errors.E039) | ||||
|     return n | ||||
|  |  | |||
|  | @ -6,6 +6,7 @@ from ..typedefs cimport attr_t, flags_t | |||
| from ..parts_of_speech cimport univ_pos_t | ||||
| from .doc cimport Doc | ||||
| from ..lexeme cimport Lexeme | ||||
| from ..errors import Errors | ||||
| 
 | ||||
| 
 | ||||
| cdef class Token: | ||||
|  | @ -17,8 +18,7 @@ cdef class Token: | |||
|     @staticmethod | ||||
|     cdef inline Token cinit(Vocab vocab, const TokenC* token, int offset, Doc doc): | ||||
|         if offset < 0 or offset >= doc.length: | ||||
|             msg = "Attempt to access token at %d, max length %d" | ||||
|             raise IndexError(msg % (offset, doc.length)) | ||||
|             raise IndexError(Errors.E040.format(i=offset, max_length=doc.length)) | ||||
|         cdef Token self = Token.__new__(Token, vocab, doc, offset) | ||||
|         return self | ||||
| 
 | ||||
|  |  | |||
|  | @ -19,8 +19,8 @@ from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM | |||
| from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX | ||||
| from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP | ||||
| from ..compat import is_config | ||||
| from ..errors import Errors | ||||
| from .. import util | ||||
| from .. import about | ||||
| from .underscore import Underscore | ||||
| 
 | ||||
| 
 | ||||
|  | @ -106,7 +106,7 @@ cdef class Token: | |||
|         elif op == 5: | ||||
|             return my >= their | ||||
|         else: | ||||
|             raise ValueError(op) | ||||
|             raise ValueError(Errors.E041.format(op=op)) | ||||
| 
 | ||||
|     @property | ||||
|     def _(self): | ||||
|  | @ -135,8 +135,7 @@ cdef class Token: | |||
|         RETURNS (Token): The token at position `self.doc[self.i+i]`. | ||||
|         """ | ||||
|         if self.i+i < 0 or (self.i+i >= len(self.doc)): | ||||
|             msg = "Error accessing doc[%d].nbor(%d), for doc of length %d" | ||||
|             raise IndexError(msg % (self.i, i, len(self.doc))) | ||||
|             raise IndexError(Errors.E042.format(i=self.i, j=i, length=len(self.doc))) | ||||
|         return self.doc[self.i+i] | ||||
| 
 | ||||
|     def similarity(self, other): | ||||
|  | @ -352,14 +351,7 @@ cdef class Token: | |||
| 
 | ||||
|     property sent_start: | ||||
|         def __get__(self): | ||||
|             # Raising a deprecation warning causes errors for autocomplete | ||||
|             #util.deprecated( | ||||
|             #    "Token.sent_start is now deprecated. Use Token.is_sent_start " | ||||
|             #    "instead, which returns a boolean value or None if the answer " | ||||
|             #    "is unknown – instead of a misleading 0 for False and 1 for " | ||||
|             #    "True. It also fixes a quirk in the old logic that would " | ||||
|             #    "always set the property to 0 for the first word of the " | ||||
|             #    "document.") | ||||
|             # Raising a deprecation warning here causes errors for autocomplete | ||||
|             # Handle broken backwards compatibility case: doc[0].sent_start | ||||
|             # was False. | ||||
|             if self.i == 0: | ||||
|  | @ -384,9 +376,7 @@ cdef class Token: | |||
| 
 | ||||
|         def __set__(self, value): | ||||
|             if self.doc.is_parsed: | ||||
|                 raise ValueError( | ||||
|                     "Refusing to write to token.sent_start if its document " | ||||
|                     "is parsed, because this may cause inconsistent state.") | ||||
|                 raise ValueError(Errors.E043) | ||||
|             if value is None: | ||||
|                 self.c.sent_start = 0 | ||||
|             elif value is True: | ||||
|  | @ -394,8 +384,7 @@ cdef class Token: | |||
|             elif value is False: | ||||
|                 self.c.sent_start = -1 | ||||
|             else: | ||||
|                 raise ValueError("Invalid value for token.sent_start. Must be " | ||||
|                                  "one of: None, True, False") | ||||
|                 raise ValueError(Errors.E044.format(value=value)) | ||||
| 
 | ||||
|     property lefts: | ||||
|         """The leftward immediate children of the word, in the syntactic | ||||
|  | @ -413,8 +402,7 @@ cdef class Token: | |||
|                 nr_iter += 1 | ||||
|                 # This is ugly, but it's a way to guard out infinite loops | ||||
|                 if nr_iter >= 10000000: | ||||
|                     raise RuntimeError("Possibly infinite loop encountered " | ||||
|                                        "while looking for token.lefts") | ||||
|                     raise RuntimeError(Errors.E045.format(attr='token.lefts')) | ||||
| 
 | ||||
|     property rights: | ||||
|         """The rightward immediate children of the word, in the syntactic | ||||
|  | @ -432,8 +420,7 @@ cdef class Token: | |||
|                 ptr -= 1 | ||||
|                 nr_iter += 1 | ||||
|                 if nr_iter >= 10000000: | ||||
|                     raise RuntimeError("Possibly infinite loop encountered " | ||||
|                                        "while looking for token.rights") | ||||
|                     raise RuntimeError(Errors.E045.format(attr='token.rights')) | ||||
|             tokens.reverse() | ||||
|             for t in tokens: | ||||
|                 yield t | ||||
|  |  | |||
|  | @ -3,6 +3,8 @@ from __future__ import unicode_literals | |||
| 
 | ||||
| import functools | ||||
| 
 | ||||
| from ..errors import Errors | ||||
| 
 | ||||
| 
 | ||||
| class Underscore(object): | ||||
|     doc_extensions = {} | ||||
|  | @ -23,7 +25,7 @@ class Underscore(object): | |||
| 
 | ||||
|     def __getattr__(self, name): | ||||
|         if name not in self._extensions: | ||||
|             raise AttributeError(name) | ||||
|             raise AttributeError(Errors.E046.format(name=name)) | ||||
|         default, method, getter, setter = self._extensions[name] | ||||
|         if getter is not None: | ||||
|             return getter(self._obj) | ||||
|  | @ -34,7 +36,7 @@ class Underscore(object): | |||
| 
 | ||||
|     def __setattr__(self, name, value): | ||||
|         if name not in self._extensions: | ||||
|             raise AttributeError(name) | ||||
|             raise AttributeError(Errors.E047.format(name=name)) | ||||
|         default, method, getter, setter = self._extensions[name] | ||||
|         if setter is not None: | ||||
|             return setter(self._obj, value) | ||||
|  |  | |||
|  | @ -11,8 +11,6 @@ import sys | |||
| import textwrap | ||||
| import random | ||||
| from collections import OrderedDict | ||||
| import inspect | ||||
| import warnings | ||||
| from thinc.neural._classes.model import Model | ||||
| import functools | ||||
| import cytoolz | ||||
|  | @ -22,6 +20,7 @@ import numpy.random | |||
| from .symbols import ORTH | ||||
| from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_ | ||||
| from .compat import import_file | ||||
| from .errors import Errors | ||||
| 
 | ||||
| # Import these directly from Thinc, so that we're sure we always have the | ||||
| # same version. | ||||
|  | @ -50,8 +49,7 @@ def get_lang_class(lang): | |||
|         try: | ||||
|             module = importlib.import_module('.lang.%s' % lang, 'spacy') | ||||
|         except ImportError: | ||||
|             msg = "Can't import language %s from spacy.lang." | ||||
|             raise ImportError(msg % lang) | ||||
|             raise ImportError(Errors.E048.format(lang=lang)) | ||||
|         LANGUAGES[lang] = getattr(module, module.__all__[0]) | ||||
|     return LANGUAGES[lang] | ||||
| 
 | ||||
|  | @ -108,7 +106,7 @@ def load_model(name, **overrides): | |||
|     """ | ||||
|     data_path = get_data_path() | ||||
|     if not data_path or not data_path.exists(): | ||||
|         raise IOError("Can't find spaCy data path: %s" % path2str(data_path)) | ||||
|         raise IOError(Errors.E049.format(path=path2str(data_path))) | ||||
|     if isinstance(name, basestring_):  # in data dir / shortcut | ||||
|         if name in set([d.name for d in data_path.iterdir()]): | ||||
|             return load_model_from_link(name, **overrides) | ||||
|  | @ -118,7 +116,7 @@ def load_model(name, **overrides): | |||
|             return load_model_from_path(Path(name), **overrides) | ||||
|     elif hasattr(name, 'exists'):  # Path or Path-like to model data | ||||
|         return load_model_from_path(name, **overrides) | ||||
|     raise IOError("Can't find model '%s'" % name) | ||||
|     raise IOError(Errors.E050.format(name=name)) | ||||
| 
 | ||||
| 
 | ||||
| def load_model_from_link(name, **overrides): | ||||
|  | @ -127,9 +125,7 @@ def load_model_from_link(name, **overrides): | |||
|     try: | ||||
|         cls = import_file(name, path) | ||||
|     except AttributeError: | ||||
|         raise IOError( | ||||
|             "Cant' load '%s'. If you're using a shortcut link, make sure it " | ||||
|             "points to a valid package (not just a data directory)." % name) | ||||
|         raise IOError(Errors.E051.format(name=name)) | ||||
|     return cls.load(**overrides) | ||||
| 
 | ||||
| 
 | ||||
|  | @ -173,8 +169,7 @@ def load_model_from_init_py(init_file, **overrides): | |||
|     data_dir = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version']) | ||||
|     data_path = model_path / data_dir | ||||
|     if not model_path.exists(): | ||||
|         msg = "Can't find model directory: %s" | ||||
|         raise ValueError(msg % path2str(data_path)) | ||||
|         raise IOError(Errors.E052.format(path=path2str(data_path))) | ||||
|     return load_model_from_path(data_path, meta, **overrides) | ||||
| 
 | ||||
| 
 | ||||
|  | @ -186,16 +181,14 @@ def get_model_meta(path): | |||
|     """ | ||||
|     model_path = ensure_path(path) | ||||
|     if not model_path.exists(): | ||||
|         msg = "Can't find model directory: %s" | ||||
|         raise ValueError(msg % path2str(model_path)) | ||||
|         raise IOError(Errors.E052.format(path=path2str(model_path))) | ||||
|     meta_path = model_path / 'meta.json' | ||||
|     if not meta_path.is_file(): | ||||
|         raise IOError("Could not read meta.json from %s" % meta_path) | ||||
|         raise IOError(Errors.E053.format(path=meta_path)) | ||||
|     meta = read_json(meta_path) | ||||
|     for setting in ['lang', 'name', 'version']: | ||||
|         if setting not in meta or not meta[setting]: | ||||
|             msg = "No valid '%s' setting found in model meta.json" | ||||
|             raise ValueError(msg % setting) | ||||
|             raise ValueError(Errors.E054.format(setting=setting)) | ||||
|     return meta | ||||
| 
 | ||||
| 
 | ||||
|  | @ -339,13 +332,10 @@ def update_exc(base_exceptions, *addition_dicts): | |||
|         for orth, token_attrs in additions.items(): | ||||
|             if not all(isinstance(attr[ORTH], unicode_) | ||||
|                        for attr in token_attrs): | ||||
|                 msg = "Invalid ORTH value in exception: key='%s', orths='%s'" | ||||
|                 raise ValueError(msg % (orth, token_attrs)) | ||||
|                 raise ValueError(Errors.E055.format(key=orth, orths=token_attrs)) | ||||
|             described_orth = ''.join(attr[ORTH] for attr in token_attrs) | ||||
|             if orth != described_orth: | ||||
|                 msg = ("Invalid tokenizer exception: ORTH values combined " | ||||
|                        "don't match original string. key='%s', orths='%s'") | ||||
|                 raise ValueError(msg % (orth, described_orth)) | ||||
|                 raise ValueError(Errors.E056.format(key=orth, orths=described_orth)) | ||||
|         exc.update(additions) | ||||
|     exc = expand_exc(exc, "'", "’") | ||||
|     return exc | ||||
|  | @ -375,8 +365,7 @@ def expand_exc(excs, search, replace): | |||
| 
 | ||||
| def normalize_slice(length, start, stop, step=None): | ||||
|     if not (step is None or step == 1): | ||||
|         raise ValueError("Stepped slices not supported in Span objects." | ||||
|                          "Try: list(tokens)[start:stop:step] instead.") | ||||
|         raise ValueError(Errors.E057) | ||||
|     if start is None: | ||||
|         start = 0 | ||||
|     elif start < 0: | ||||
|  | @ -387,7 +376,6 @@ def normalize_slice(length, start, stop, step=None): | |||
|     elif stop < 0: | ||||
|         stop += length | ||||
|     stop = min(length, max(start, stop)) | ||||
|     assert 0 <= start <= stop <= length | ||||
|     return start, stop | ||||
| 
 | ||||
| 
 | ||||
|  | @ -524,18 +512,6 @@ def from_disk(path, readers, exclude): | |||
|     return path | ||||
| 
 | ||||
| 
 | ||||
| def deprecated(message, filter='always'): | ||||
|     """Show a deprecation warning. | ||||
| 
 | ||||
|     message (unicode): The message to display. | ||||
|     filter (unicode): Filter value. | ||||
|     """ | ||||
|     stack = inspect.stack()[-1] | ||||
|     with warnings.catch_warnings(): | ||||
|         warnings.simplefilter(filter, DeprecationWarning) | ||||
|         warnings.warn_explicit(message, DeprecationWarning, stack[1], stack[2]) | ||||
| 
 | ||||
| 
 | ||||
| def print_table(data, title=None): | ||||
|     """Print data in table format. | ||||
| 
 | ||||
|  |  | |||
|  | @ -14,6 +14,7 @@ from thinc.neural._classes.model import Model | |||
| 
 | ||||
| from .strings cimport StringStore, hash_string | ||||
| from .compat import basestring_, path2str | ||||
| from .errors import Errors | ||||
| from . import util | ||||
| 
 | ||||
| from cython.operator cimport dereference as deref | ||||
|  | @ -114,7 +115,7 @@ cdef class Vectors: | |||
|         """ | ||||
|         i = self.key2row[key] | ||||
|         if i is None: | ||||
|             raise KeyError(key) | ||||
|             raise KeyError(Errors.E058.format(key=key)) | ||||
|         else: | ||||
|             return self.data[i] | ||||
| 
 | ||||
|  | @ -215,7 +216,8 @@ cdef class Vectors: | |||
|         RETURNS: The requested key, keys, row or rows. | ||||
|         """ | ||||
|         if sum(arg is None for arg in (key, keys, row, rows)) != 3: | ||||
|             raise ValueError("One (and only one) keyword arg must be set.") | ||||
|             bad_kwargs = {'key': key, 'keys': keys, 'row': row, 'rows': rows} | ||||
|             raise ValueError(Errors.E059.format(kwargs=bad_kwargs)) | ||||
|         xp = get_array_module(self.data) | ||||
|         if key is not None: | ||||
|             if isinstance(key, basestring_): | ||||
|  | @ -254,9 +256,9 @@ cdef class Vectors: | |||
|             row = self.key2row[key] | ||||
|         elif row is None: | ||||
|             if self.is_full: | ||||
|                 raise ValueError("Cannot add new key to vectors -- full") | ||||
|                 raise ValueError(Errors.E060.format(rows=self.data.shape[0], | ||||
|                                                     cols=self.data.shape[1])) | ||||
|             row = deref(self._unset.begin()) | ||||
| 
 | ||||
|         self.key2row[key] = row | ||||
|         if vector is not None: | ||||
|             self.data[row] = vector | ||||
|  | @ -318,7 +320,7 @@ cdef class Vectors: | |||
|                 width = int(dims) | ||||
|                 break | ||||
|         else: | ||||
|             raise IOError("Expected file named e.g. vectors.128.f.bin") | ||||
|             raise IOError(Errors.E061.format(filename=path)) | ||||
|         bin_loc = path / 'vectors.{dims}.{dtype}.bin'.format(dims=dims, | ||||
|                                                              dtype=dtype) | ||||
|         xp = get_array_module(self.data) | ||||
|  |  | |||
|  | @ -16,6 +16,7 @@ from .attrs cimport PROB, LANG, ORTH, TAG | |||
| from .structs cimport SerializedLexemeC | ||||
| 
 | ||||
| from .compat import copy_reg, basestring_ | ||||
| from .errors import Errors | ||||
| from .lemmatizer import Lemmatizer | ||||
| from .attrs import intify_attrs | ||||
| from .vectors import Vectors | ||||
|  | @ -100,15 +101,9 @@ cdef class Vocab: | |||
|                     flag_id = bit | ||||
|                     break | ||||
|             else: | ||||
|                 raise ValueError( | ||||
|                     "Cannot find empty bit for new lexical flag. All bits " | ||||
|                     "between 0 and 63 are occupied. You can replace one by " | ||||
|                     "specifying the flag_id explicitly, e.g. " | ||||
|                     "`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA`.") | ||||
|                 raise ValueError(Errors.E062) | ||||
|         elif flag_id >= 64 or flag_id < 1: | ||||
|             raise ValueError( | ||||
|                 "Invalid value for flag_id: %d. Flag IDs must be between " | ||||
|                 "1 and 63 (inclusive)" % flag_id) | ||||
|             raise ValueError(Errors.E063.format(value=flag_id)) | ||||
|         for lex in self: | ||||
|             lex.set_flag(flag_id, flag_getter(lex.orth_)) | ||||
|         self.lex_attr_getters[flag_id] = flag_getter | ||||
|  | @ -127,8 +122,9 @@ cdef class Vocab: | |||
|         cdef size_t addr | ||||
|         if lex != NULL: | ||||
|             if lex.orth != self.strings[string]: | ||||
|                 raise LookupError.mismatched_strings( | ||||
|                     lex.orth, self.strings[string], string) | ||||
|                 raise KeyError(Errors.E064.format(string=lex.orth, | ||||
|                                                   orth=self.strings[string], | ||||
|                                                   orth_id=string)) | ||||
|             return lex | ||||
|         else: | ||||
|             return self._new_lexeme(mem, string) | ||||
|  | @ -171,7 +167,8 @@ cdef class Vocab: | |||
|         if not is_oov: | ||||
|             key = hash_string(string) | ||||
|             self._add_lex_to_vocab(key, lex) | ||||
|         assert lex != NULL, string | ||||
|         if lex == NULL: | ||||
|             raise ValueError(Errors.E085.format(string=string)) | ||||
|         return lex | ||||
| 
 | ||||
|     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: | ||||
|  | @ -254,7 +251,7 @@ cdef class Vocab: | |||
|         width, you have to call this to change the size of the vectors. | ||||
|         """ | ||||
|         if width is not None and shape is not None: | ||||
|             raise ValueError("Only one of width and shape can be specified") | ||||
|             raise ValueError(Errors.E065.format(width=width, shape=shape)) | ||||
|         elif shape is not None: | ||||
|             self.vectors = Vectors(shape=shape) | ||||
|         else: | ||||
|  | @ -471,7 +468,10 @@ cdef class Vocab: | |||
|             if ptr == NULL: | ||||
|                 continue | ||||
|             py_str = self.strings[lexeme.orth] | ||||
|             assert self.strings[py_str] == lexeme.orth, (py_str, lexeme.orth) | ||||
|             if self.strings[py_str] != lexeme.orth: | ||||
|                 raise ValueError(Errors.E086.format(string=py_str, | ||||
|                                                     orth_id=lexeme.orth, | ||||
|                                                     hash_id=self.strings[py_str])) | ||||
|             key = hash_string(py_str) | ||||
|             self._by_hash.set(key, lexeme) | ||||
|             self._by_orth.set(lexeme.orth, lexeme) | ||||
|  | @ -512,16 +512,3 @@ def unpickle_vocab(sstore, vectors, morphology, data_dir, | |||
| 
 | ||||
| 
 | ||||
| copy_reg.pickle(Vocab, pickle_vocab, unpickle_vocab) | ||||
| 
 | ||||
| 
 | ||||
| class LookupError(Exception): | ||||
|     @classmethod | ||||
|     def mismatched_strings(cls, id_, id_string, original_string): | ||||
|         return cls( | ||||
|             "Error fetching a Lexeme from the Vocab. When looking up a " | ||||
|             "string, the lexeme returned had an orth ID that did not match " | ||||
|             "the query string. This means that the cached lexeme structs are " | ||||
|             "mismatched to the string encoding table. The mismatched:\n" | ||||
|             "Query string: {}\n" | ||||
|             "Orth cached: {}\n" | ||||
|             "Orth ID: {}".format(repr(original_string), repr(id_string), id_)) | ||||
|  |  | |||