Merge branch 'master' of https://github.com/explosion/spaCy

2025-11-19 01:05:56 +03:00 · 2017-04-16 12:07:04 +02:00 · 2017-04-16 12:07:04 +02:00 · d7229967b0
commit d7229967b0
parent 28a7de8100 40e3024241
48 changed files with 888 additions and 765 deletions
--- a/examples/training/train_ner_standalone.py
+++ b/examples/training/train_ner_standalone.py
@ -1,3 +1,4 @@
+#!/usr/bin/env python
 '''Example of training a named entity recognition system from scratch using spaCy

 This example is written to be self-contained and reasonably transparent.
@ -81,7 +82,7 @@ def load_vocab(path):
 def init_ner_model(vocab, features=None):
    if features is None:
        features = tuple(EntityRecognizer.feature_templates)
-    return BeamEntityRecognizer(vocab, features=features)
+    return EntityRecognizer(vocab, features=features)


 def save_ner_model(model, path):
@ -99,7 +100,7 @@ def save_ner_model(model, path):


 def load_ner_model(vocab, path):
-    return BeamEntityRecognizer.load(path, vocab)
+    return EntityRecognizer.load(path, vocab)


 class Pipeline(object):
@ -110,18 +111,21 @@ class Pipeline(object):
            raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
        if not path.is_dir():
            raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
-        vocab = load_vocab(path / 'vocab')
+        vocab = load_vocab(path)
        tokenizer = Tokenizer(vocab, {}, None, None, None)
        ner_model = load_ner_model(vocab, path / 'ner')
        return cls(vocab, tokenizer, ner_model)

-    def __init__(self, vocab=None, tokenizer=None, ner_model=None):
+    def __init__(self, vocab=None, tokenizer=None, entity=None):
+        '''Assemble the pipeline, creating a default for each missing part.
+
+        vocab: Vocabulary shared by the components; built with init_vocab()
+            when None.
+        tokenizer: Tokenizer to use; a bare Tokenizer over `vocab` when None.
+        entity: Entity recognizer; built with init_ner_model(vocab) when None.
+        '''
        if vocab is None:
-            self.vocab = init_vocab()
+            vocab = init_vocab()
        if tokenizer is None:
            tokenizer = Tokenizer(vocab, {}, None, None, None)
-        if ner_model is None:
-            self.entity = init_ner_model(self.vocab)
+        if entity is None:
+            # Use the local `vocab`: `self.vocab` is only assigned below, so
+            # reading it here would raise AttributeError.
+            entity = init_ner_model(vocab)
+        self.vocab = vocab
+        self.tokenizer = tokenizer
+        self.entity = entity
        self.pipeline = [self.entity]

    def __call__(self, input_):
@ -173,7 +177,7 @@ class Pipeline(object):
        save_ner_model(self.entity, path / 'ner')


-def train(nlp, train_examples, dev_examples, nr_epoch=5):
+def train(nlp, train_examples, dev_examples, ctx, nr_epoch=5):
    next_epoch = train_examples
    print("Iter", "Loss", "P", "R", "F")
    for i in range(nr_epoch):
@ -186,14 +190,17 @@ def train(nlp, train_examples, dev_examples, nr_epoch=5):
                next_epoch.append((input_, annot))
        random.shuffle(next_epoch)
        scores = nlp.evaluate(dev_examples)
+        report_scores(i, loss, scores)
+    nlp.average_weights()
+    scores = nlp.evaluate(dev_examples)
+    report_scores(channels, i+1, loss, scores)
+
+
+def report_scores(i, loss, scores):
+    '''Print one row of the training table: iteration, loss, P, R, F.
+
+    i (int): Iteration number shown in the first column.
+    loss: Accumulated loss; truncated to an int for display.
+    scores (dict): Must provide 'ents_p', 'ents_r' and 'ents_f'.
+    '''
    precision = '%.2f' % scores['ents_p']
    recall = '%.2f' % scores['ents_r']
    f_measure = '%.2f' % scores['ents_f']
-        print(i, int(loss), precision, recall, f_measure)
-    nlp.average_weights()
-    scores = nlp.evaluate(dev_examples)
-    print("After averaging")
-    print(scores['ents_p'], scores['ents_r'], scores['ents_f'])
+    # Include the iteration so each row lines up with the
+    # "Iter Loss P R F" header printed by train().
+    print('%d %d %s %s %s' % (i, int(loss), precision, recall, f_measure))


 def read_examples(path):
@ -221,15 +228,17 @@ def read_examples(path):
    train_loc=("Path to your training data", "positional", None, Path),
    dev_loc=("Path to your development data", "positional", None, Path),
 )
-def main(model_dir, train_loc, dev_loc, nr_epoch=10):
+def main(model_dir=Path('/home/matt/repos/spaCy/spacy/data/de-1.0.0'),
+        train_loc=None, dev_loc=None, nr_epoch=30):
+    
    train_examples = read_examples(train_loc)
    dev_examples = read_examples(dev_loc)
-    nlp = Pipeline()
+    nlp = Pipeline.load(model_dir)

-    train(nlp, train_examples, list(dev_examples), nr_epoch)
+    train(nlp, train_examples, list(dev_examples), ctx, nr_epoch)

    nlp.save(model_dir)


 if __name__ == '__main__':
-    plac.call(main)
+    main()
--- a/examples/training/train_new_entity_type.py
+++ b/examples/training/train_new_entity_type.py
@ -0,0 +1,74 @@
+from __future__ import unicode_literals, print_function
+import json
+import pathlib
+import random
+
+import spacy
+from spacy.pipeline import EntityRecognizer
+from spacy.gold import GoldParse
+from spacy.tagger import Tagger
+
+ 
+try:
+    unicode
+except NameError:
+    # Python 3 has no `unicode` builtin; alias it to `str`. Only NameError
+    # is expected here, so nothing else is swallowed.
+    unicode = str
+
+
+def train_ner(nlp, train_data, output_dir):
+    """
+    Update the pipeline's entity recognizer on `train_data`, then save it.
+
+    nlp: Loaded pipeline; its make_doc, tagger, entity and vocab are used.
+    train_data (list): (raw_text, entity_offsets) pairs. Shuffled in place
+        on every iteration.
+    output_dir: Directory passed to nlp.save_to_directory(); saving is
+        skipped when None.
+    """
+    # Add new words to vocab.
+    for raw_text, _ in train_data:
+        doc = nlp.make_doc(raw_text)
+        for word in doc:
+            _ = nlp.vocab[word.orth]
+
+    for itn in range(20):
+        random.shuffle(train_data)
+        for raw_text, entity_offsets in train_data:
+            # Build the doc first so the gold parse is aligned to this
+            # example's text rather than to a doc left over from earlier.
+            doc = nlp.make_doc(raw_text)
+            gold = GoldParse(doc, entities=entity_offsets)
+            nlp.tagger(doc)
+            nlp.entity.update(doc, gold)
+    nlp.end_training()
+    if output_dir is not None:
+        nlp.save_to_directory(output_dir)
+
+
+def main(model_name, output_directory=None):
+    """
+    Train the ANIMAL entity type on a few toy sentences and print the result.
+
+    model_name: Name handed to spacy.load() to obtain the starting pipeline.
+    output_directory: Optional location for the trained model; converted to
+        a pathlib.Path when given.
+
+    The entities found in a test sentence are printed twice: once with the
+    in-memory pipeline and once with a pipeline loaded via
+    spacy.load('en', path=output_directory).
+    """
+    nlp = spacy.load(model_name)
+
+    train_data = [
+        (
+            "Horses are too tall and they pretend to care about your feelings",
+            [(0, 6, 'ANIMAL')],
+        ),
+        (
+            "horses are too tall and they pretend to care about your feelings",
+            [(0, 6, 'ANIMAL')]
+        ),
+        (
+            "horses pretend to care about your feelings",
+            [(0, 6, 'ANIMAL')]
+        ),
+        (
+            "they pretend to care about your feelings, those horses",
+            [(48, 54, 'ANIMAL')]
+        )
+    ]
+    nlp.entity.add_label('ANIMAL')
+    if output_directory is not None:
+        output_directory = pathlib.Path(output_directory)
+    # train_ner() updates `nlp` in place and returns nothing, so there is no
+    # result to bind here.
+    train_ner(nlp, train_data, output_directory)
+
+    doc = nlp('Do you like horses?')
+    for ent in doc.ents:
+        print(ent.label_, ent.text)
+    nlp2 = spacy.load('en', path=output_directory)
+    nlp2.entity.add_label('ANIMAL')
+    doc2 = nlp2('Do you like horses?')
+    for ent in doc2.ents:
+        print(ent.label_, ent.text)
+
+
+if __name__ == '__main__':
+    import plac
+    plac.call(main)
--- a/fabfile.py
+++ b/fabfile.py
@ -14,7 +14,7 @@ VENV_DIR = path.join(PWD, ENV)
 def env(lang='python2.7'):
    if path.exists(VENV_DIR):
        local('rm -rf {env}'.format(env=VENV_DIR))
-    local('virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR))
+    local('python -m virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR))


 def install():
--- a/spacy/init.py
+++ b/spacy/init.py
@ -1,27 +1,13 @@
 # coding: utf8
-from __future__ import unicode_literals, print_function
+from __future__ import unicode_literals

-import json
 from pathlib import Path
+
 from .util import set_lang_class, get_lang_class, parse_package_meta
 from .deprecated import resolve_model_name
 from .cli import info

-from . import en
-from . import de
-from . import zh
-from . import es
-from . import it
-from . import hu
-from . import fr
-from . import pt
-from . import nl
-from . import sv
-from . import fi
-from . import bn
-from . import he
-
-from .about import *
+from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he


 set_lang_class(en.English.lang, en.English)
--- a/spacy/main.py
+++ b/spacy/main.py
@ -14,8 +14,9 @@ from spacy.cli import convert as cli_convert


 class CLI(object):
-    """Command-line interface for spaCy"""
-
+    """
+    Command-line interface for spaCy
+    """
    commands = ('download', 'link', 'info', 'package', 'train', 'model', 'convert')

    @plac.annotations(
@ -29,7 +30,6 @@ class CLI(object):
        can be shortcut, model name or, if --direct flag is set, full model name
        with version.
        """
-
        cli_download(model, direct)


@ -44,7 +44,6 @@ class CLI(object):
        either the name of a pip package, or the local path to the model data
        directory. Linking models allows loading them via spacy.load(link_name).
        """
-
        cli_link(origin, link_name, force)


@ -58,7 +57,6 @@ class CLI(object):
        speficied as an argument, print model information. Flag --markdown
        prints details in Markdown for easy copy-pasting to GitHub issues.
        """
-
        cli_info(model, markdown)


@ -73,7 +71,6 @@ class CLI(object):
        installation files. A new directory will be created in the specified
        output directory, and model data will be copied over.
        """
-
        cli_package(input_dir, output_dir, force)


@ -93,7 +90,6 @@ class CLI(object):
        """
        Train a model. Expects data in spaCy's JSON format.
        """
-
        cli_train(lang, output_dir, train_data, dev_data, n_iter, not no_tagger,
                  not no_parser, not no_ner, parser_L1)

@ -108,7 +104,6 @@ class CLI(object):
        """
        Initialize a new model and its data directory.
        """
-
        cli_model(lang, model_dir, freqs_data, clusters_data, vectors_data)

    @plac.annotations(
@ -122,7 +117,6 @@ class CLI(object):
        Convert files into JSON format for use with train command and other
        experiment management functions.
        """
-
        cli_convert(input_file, output_dir, n_sents, morphology)


--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@ -1,3 +1,7 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
 IDS = {
    "": NULL_ATTR,
    "IS_ALPHA": IS_ALPHA,
@ -92,7 +96,8 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]


 def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
-    '''Normalize a dictionary of attributes, converting them to ints.
+    """
+    Normalize a dictionary of attributes, converting them to ints.

    Arguments:
        stringy_attrs (dict):
@ -105,7 +110,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
        inty_attrs (dict):
            Attributes dictionary with keys and optionally values converted to
            ints.
-    '''
+    """
    inty_attrs = {}
    if _do_deprecated:
        if 'F' in stringy_attrs:
--- a/spacy/cfile.pyx
+++ b/spacy/cfile.pyx
@ -1,3 +1,6 @@
+# coding: utf8
+from __future__ import unicode_literals
+
 from libc.stdio cimport fopen, fclose, fread, fwrite
 from libc.string cimport memcpy

--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals

-import io
 from pathlib import Path

 from .converters import conllu2json
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@ -2,12 +2,12 @@
 from __future__ import unicode_literals

 import json
-from ...gold import read_json_file, merge_sents
 from ... import util


 def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
-    """Convert conllu files into JSON format for use with train cli.
+    """
+    Convert conllu files into JSON format for use with train cli.
    use_morphology parameter enables appending morphology to tags, which is
    useful for languages such as Spanish, where UD tags are not so rich.
    """
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals

-import pip
 import requests
 import os
 import subprocess
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@ -18,7 +18,6 @@ def info(model=None, markdown=False):
        else:
            data['source'] = str(model_path)
        print_info(data, "model " + model, markdown)
-
    else:
        data = get_spacy_data()
        print_info(data, "spaCy", markdown)
@ -26,10 +25,8 @@ def info(model=None, markdown=False):

 def print_info(data, title, markdown):
    title = "Info about {title}".format(title=title)
-
    if markdown:
        util.print_markdown(data, title=title)
-
    else:
        util.print_table(data, title=title)

--- a/spacy/cli/link.py
+++ b/spacy/cli/link.py
@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import pip
 from pathlib import Path
 import importlib
+from ..compat import unicode_, symlink_to
 from .. import util


@ -20,7 +21,6 @@ def link_package(package_name, link_name, force=False):
    # Python's installation and import rules are very complicated.
    pkg = importlib.import_module(package_name)
    package_path = Path(pkg.__file__).parent.parent
-
    meta = get_meta(package_path, package_name)
    model_name = package_name + '-' + meta['version']
    model_path = package_path / package_name / model_name
@ -43,23 +43,17 @@ def symlink(model_path, link_name, force):
    elif link_path.exists():
        link_path.unlink()

-    # Add workaround for Python 2 on Windows (see issue #909)
-    if util.is_python2() and util.is_windows():
-        import subprocess
-        command = ['mklink', '/d', unicode(link_path), unicode(model_path)]
    try:
-            subprocess.call(command, shell=True)
+        symlink_to(link_path, model_path)
    except:
-            # This is quite dirty, but just making sure other Windows-specific
-            # errors are caught so users at least see a proper error message.
+        # This is quite dirty, but just making sure other errors are caught so
+        # users at least see a proper message.
        util.sys_exit(
            "Creating a symlink in spacy/data failed. You can still import "
            "the model as a Python package and call its load() method, or "
            "create the symlink manually:",
-                "{a} --> {b}".format(a=unicode(model_path), b=unicode(link_path)),
+            "{a} --> {b}".format(a=unicode_(model_path), b=unicode_(link_path)),
            title="Error: Couldn't link model to '{l}'".format(l=link_name))
-    else:
-        link_path.symlink_to(model_path)

    util.print_msg(
        "{a} --> {b}".format(a=model_path.as_posix(), b=link_path.as_posix()),
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@ -1,20 +1,13 @@
 # coding: utf8
 from __future__ import unicode_literals

-import json
 import shutil
 import requests
 from pathlib import Path

-import six
-
-from .. import about
+from ..compat import unicode_, json_dumps
 from .. import util

-if six.PY2:
-    json_dumps = lambda data: json.dumps(data, indent=2).decode("utf8")
-elif six.PY3:
-    json_dumps = lambda data: json.dumps(data, indent=2)

 def package(input_dir, output_dir, force):
    input_path = Path(input_dir)
@ -32,31 +25,31 @@ def package(input_dir, output_dir, force):
    package_path = main_path / model_name

    create_dirs(package_path, force)
-    shutil.copytree(input_path.as_posix(), (package_path / model_name_v).as_posix())
+    shutil.copytree(unicode_(input_path), unicode_(package_path / model_name_v))
    create_file(main_path / 'meta.json', json_dumps(meta))
    create_file(main_path / 'setup.py', template_setup)
    create_file(main_path / 'MANIFEST.in', template_manifest)
    create_file(package_path / '__init__.py', template_init)

    util.print_msg(
-        main_path.as_posix(),
+        unicode_(main_path),
        "To build the package, run `python setup.py sdist` in that directory.",
        title="Successfully created package {p}".format(p=model_name_v))


 def check_dirs(input_path, output_path):
+    """
+    Report a missing directory through util.sys_exit.
+
+    input_path (Path): Model directory that is going to be packaged.
+    output_path (Path): Directory the package is written into.
+    """
    if not input_path.exists():
-        util.sys_exit(input_path.as_poisx(), title="Model directory not found")
+        # Convert the path itself; the misspelled `as_poisx` attribute does
+        # not exist on Path and raised AttributeError instead of reporting.
+        util.sys_exit(unicode_(input_path), title="Model directory not found")
    if not output_path.exists():
-        util.sys_exit(output_path.as_posix(), title="Output directory not found")
+        util.sys_exit(unicode_(output_path), title="Output directory not found")


 def create_dirs(package_path, force):
+    """
+    Create the package directory, replacing an existing one only if forced.
+
+    package_path (Path): Directory to create, including missing parents.
+    force (bool): Delete an existing directory first. When False and the
+        directory exists, the conflict is reported through util.sys_exit.
+    """
    if package_path.exists():
        if force:
-            shutil.rmtree(package_path.as_posix())
+            # Pass the path as text; `package_path.as_posix` without a call
+            # is a bound method, whose string form is not a real path.
+            shutil.rmtree(unicode_(package_path))
        else:
-            util.sys_exit(package_path.as_posix(),
+            util.sys_exit(unicode_(package_path),
                "Please delete the directory and try again.",
                title="Package directory already exists")
    Path.mkdir(package_path, parents=True)
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -5,8 +5,6 @@ import json
 from pathlib import Path

 from ..scorer import Scorer
-from ..tagger import Tagger
-from ..syntax.parser import Parser
 from ..gold import GoldParse, merge_sents
 from ..gold import read_json_file as read_gold_json
 from .. import util
@ -60,7 +58,6 @@ def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_
    print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")

    with Language.train(output_path, train_data, tagger_cfg, parser_cfg, entity_cfg) as trainer:
-        loss = 0
        for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)):
            for doc, gold in epoch:
                trainer.update(doc, gold)
--- a/spacy/compat.py
+++ b/spacy/compat.py
@ -0,0 +1,54 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import six
+import sys
+import ujson
+
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
+
+try:
+    import copy_reg
+except ImportError:
+    import copyreg as copy_reg
+
+
+is_python2 = six.PY2
+is_python3 = six.PY3
+is_windows = sys.platform.startswith('win')
+is_linux = sys.platform.startswith('linux')
+is_osx = sys.platform == 'darwin'
+
+
+if is_python2:
+    bytes_ = str
+    unicode_ = unicode
+    basestring_ = basestring
+    input_ = raw_input
+    json_dumps = lambda data: ujson.dumps(data, indent=2).decode('utf8')
+
+elif is_python3:
+    bytes_ = bytes
+    unicode_ = str
+    basestring_ = str
+    input_ = input
+    json_dumps = lambda data: ujson.dumps(data, indent=2)
+
+
+def symlink_to(orig, dest):
+    """
+    Create a symlink at `orig` that points to `dest`.
+
+    orig (Path): Location of the link to create.
+    dest (Path): Target the link should point to.
+
+    Raises subprocess.CalledProcessError if the `mklink` fallback fails, so
+    callers see the failure instead of a silently missing link.
+    """
+    if is_python2 and is_windows:
+        # Workaround for Python 2 on Windows (see issue #909): shell out to
+        # `mklink`. check_call makes a non-zero exit status raise.
+        import subprocess
+        subprocess.check_call(['mklink', '/d', unicode_(orig), unicode_(dest)],
+                              shell=True)
+    else:
+        orig.symlink_to(dest)
+
+
+def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
+    """
+    Check whether the running interpreter and platform match the given flags.
+
+    Each argument is either None (don't care) or a value that must equal the
+    corresponding module-level flag (is_python2, is_python3, is_windows,
+    is_linux, is_osx).
+
+    Returns (bool): True only if every non-None argument matches.
+    """
+    return ((python2 is None or python2 == is_python2) and
+            (python3 is None or python3 == is_python3) and
+            (windows is None or windows == is_windows) and
+            (linux is None or linux == is_linux) and
+            (osx is None or osx == is_osx))
--- a/spacy/deprecated.py
+++ b/spacy/deprecated.py
@ -1,16 +1,14 @@
+# coding: utf8
+from __future__ import unicode_literals
+
 from pathlib import Path
+
 from . import about
 from . import util
 from .cli import download
 from .cli import link


-try:
-    basestring
-except NameError:
-    basestring = str
-
-
 def read_lang_data(package):
    tokenization = package.load_json(('tokenizer', 'specials.json'))
    with package.open(('tokenizer', 'prefix.txt'), default=None) as file_:
@ -36,7 +34,8 @@ def align_tokens(ref, indices): # Deprecated, surely?


 def detokenize(token_rules, words): # Deprecated?
-    """To align with treebanks, return a list of "chunks", where a chunk is a
+    """
+    To align with treebanks, return a list of "chunks", where a chunk is a
    sequence of tokens that are separated by whitespace in actual strings. Each
    chunk should be a tuple of token indices, e.g.

@ -57,10 +56,30 @@ def detokenize(token_rules, words): # Deprecated?
    return positions


-def fix_glove_vectors_loading(overrides):
-    """Special-case hack for loading the GloVe vectors, to support deprecated
-    <1.0 stuff. Phase this out once the data is fixed."""
+def match_best_version(target_name, target_version, path):
+    """
+    Find the highest-versioned data directory named `target_name` in `path`.
+
+    target_name: Name part (text before the first '-') to look for.
+    target_version: Accepted for compatibility; not used in the comparison.
+    path: Directory to scan; passed through util.ensure_path first.
+
+    Returns (Path or None): The best match, or None if `path` is missing or
+    contains no usable match.
+    """
+    path = util.ensure_path(path)
+    if path is None or not path.exists():
+        return None
+    matches = []
+    for data_name in path.iterdir():
+        name, version = split_data_name(data_name.parts[-1])
+        if name != target_name:
+            continue
+        try:
+            # An entry without a '-version' suffix has version '' and gets
+            # the empty tuple, which sorts below every real version.
+            version_key = tuple(float(v) for v in version.split('.') if v)
+        except ValueError:
+            # Non-numeric version component: it cannot be ranked, so skip
+            # the entry rather than abort the whole lookup.
+            continue
+        matches.append((version_key, data_name))
+    if matches:
+        return Path(max(matches)[1])
+    else:
+        return None

+
+def split_data_name(name):
+    """
+    Split a data directory name into its name and version parts.
+
+    The split happens at the first '-'. A name without '-' yields the name
+    itself paired with an empty version string.
+    """
+    if '-' not in name:
+        return (name, '')
+    return name.split('-', 1)
+
+
+def fix_glove_vectors_loading(overrides):
+    """
+    Special-case hack for loading the GloVe vectors, to support deprecated
+    <1.0 stuff. Phase this out once the data is fixed.
+    """
    if 'data_dir' in overrides and 'path' not in overrides:
        raise ValueError("The argument 'data_dir' has been renamed to 'path'")
    if overrides.get('path') is False:
@ -68,18 +87,16 @@ def fix_glove_vectors_loading(overrides):
    if overrides.get('path') in (None, True):
        data_path = util.get_data_path()
    else:
-        path = overrides['path']
-        if isinstance(path, basestring):
-            path = Path(path)
+        path = util.ensure_path(overrides['path'])
        data_path = path.parent
    vec_path = None
    if 'add_vectors' not in overrides:
        if 'vectors' in overrides:
-            vec_path = util.match_best_version(overrides['vectors'], None, data_path)
+            vec_path = match_best_version(overrides['vectors'], None, data_path)
            if vec_path is None:
                return overrides
        else:
-            vec_path = util.match_best_version('en_glove_cc_300_1m_vectors', None, data_path)
+            vec_path = match_best_version('en_glove_cc_300_1m_vectors', None, data_path)
        if vec_path is not None:
            vec_path = vec_path / 'vocab' / 'vec.bin'
    if vec_path is not None:
@ -88,13 +105,13 @@ def fix_glove_vectors_loading(overrides):


 def resolve_model_name(name):
-    """If spaCy is loaded with 'de', check if symlink already exists. If
+    """
+    If spaCy is loaded with 'de', check if symlink already exists. If
    not, user have upgraded from older version and have old models installed.
    Check if old model directory exists and if so, return that instead and create
    shortcut link. If English model is found and no shortcut exists, raise error
    and tell user to install new model.
    """
-
    if name == 'en' or name == 'de':
        versions = ['1.0.0', '1.1.0']
        data_path = Path(util.get_data_path())
@ -117,9 +134,11 @@ def resolve_model_name(name):


 class ModelDownload():
-    """Replace download modules within en and de with deprecation warning and
+    """
+    Replace download modules within en and de with deprecation warning and
    download default language model (using shortcut). Use classmethods to allow
-    importing ModelDownload as download and calling download.en() etc."""
+    importing ModelDownload as download and calling download.en() etc.
+    """

    @classmethod
    def load(self, lang):
--- a/spacy/en/init.py
+++ b/spacy/en/init.py
@ -11,12 +11,6 @@ from ..deprecated import fix_glove_vectors_loading
 from .language_data import *


-try:
-    basestring
-except NameError:
-    basestring = str
-
-
 class English(Language):
    lang = 'en'

--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -1,13 +1,11 @@
 # cython: profile=True
+# coding: utf8
 from __future__ import unicode_literals, print_function

 import io
-import json
 import re
-import os
-from os import path
-
-import ujson as json
+import ujson
+from pathlib import Path

 from .syntax import nonproj

@ -141,12 +139,13 @@ def _min_edit_path(cand_words, gold_words):


 def read_json_file(loc, docs_filter=None):
-    if path.isdir(loc):
-        for filename in os.listdir(loc):
-            yield from read_json_file(path.join(loc, filename))
+    loc = Path(loc)
+    if loc.is_dir():
+        for filename in loc.iterdir():
+            yield from read_json_file(loc / filename)
    else:
        with io.open(loc, 'r', encoding='utf8') as file_:
-            docs = json.load(file_)
+            docs = ujson.load(file_)
        for doc in docs:
            if docs_filter is not None and not docs_filter(doc):
                continue
@ -220,7 +219,8 @@ cdef class GoldParse:

    def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
                 deps=None, entities=None, make_projective=False):
-        """Create a GoldParse.
+        """
+        Create a GoldParse.

        Arguments:
            doc (Doc):
@ -302,7 +302,8 @@ cdef class GoldParse:
            self.heads = proj_heads

    def __len__(self):
-        """Get the number of gold-standard tokens.
+        """
+        Get the number of gold-standard tokens.

        Returns (int): The number of gold-standard tokens.
        """
@ -310,13 +311,16 @@ cdef class GoldParse:

    @property
    def is_projective(self):
-        """Whether the provided syntactic annotations form a projective dependency
-        tree."""
+        """
+        Whether the provided syntactic annotations form a projective dependency
+        tree.
+        """
        return not nonproj.is_nonproj_tree(self.heads)


 def biluo_tags_from_offsets(doc, entities):
-    '''Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
+    """
+    Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
    scheme (biluo).

    Arguments:
@ -347,7 +351,7 @@ def biluo_tags_from_offsets(doc, entities):
        tags = biluo_tags_from_offsets(doc, entities)

        assert tags == ['O', 'O', 'U-LOC', 'O']
-    '''
+    """
    starts = {token.idx: token.i for token in doc}
    ends = {token.idx+len(token): token.i for token in doc}
    biluo = ['-' for _ in doc]
--- a/spacy/language.py
+++ b/spacy/language.py
@ -1,39 +1,26 @@
-from __future__ import absolute_import
-from __future__ import unicode_literals
-import pathlib
+# coding: utf8
+from __future__ import absolute_import, unicode_literals
 from contextlib import contextmanager
 import shutil
-
 import ujson

-
-try:
-    basestring
-except NameError:
-    basestring = str
-
-try:
-    unicode
-except NameError:
-    unicode = str
-
 from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .tagger import Tagger
 from .matcher import Matcher
-from . import attrs
-from . import orth
-from . import util
-from . import language_data
 from .lemmatizer import Lemmatizer
 from .train import Trainer
-
-from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP
 from .syntax.parser import get_templates
 from .syntax.nonproj import PseudoProjectivity
 from .pipeline import DependencyParser, EntityRecognizer
 from .syntax.arc_eager import ArcEager
 from .syntax.ner import BiluoPushDown
+from .compat import unicode_
+from .attrs import IS_STOP
+from . import attrs
+from . import orth
+from . import util
+from . import language_data


 class BaseDefaults(object):
@ -150,25 +137,15 @@ class BaseDefaults(object):
        return pipeline

    token_match = language_data.TOKEN_MATCH
-
    prefixes = tuple(language_data.TOKENIZER_PREFIXES)
-
    suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
-
    infixes = tuple(language_data.TOKENIZER_INFIXES)
-
    tag_map = dict(language_data.TAG_MAP)
-
    tokenizer_exceptions = {}
-
    parser_features = get_templates('parser')
-
    entity_features = get_templates('ner')
-
    tagger_features = Tagger.feature_templates # TODO -- fix this
-
    stop_words = set()
-
    lemma_rules = {}
    lemma_exc = {}
    lemma_index = {}
@ -202,53 +179,42 @@ class BaseDefaults(object):


 class Language(object):
-    '''A text-processing pipeline. Usually you'll load this once per process, and
+    """
+    A text-processing pipeline. Usually you'll load this once per process, and
    pass the instance around your program.
-    '''
+    """
    Defaults = BaseDefaults
    lang = None

    @classmethod
-    @contextmanager
-    def train(cls, path, gold_tuples, *configs):
-        if isinstance(path, basestring):
-            path = pathlib.Path(path)
-        tagger_cfg, parser_cfg, entity_cfg = configs
-        dep_model_dir = path / 'deps'
-        ner_model_dir = path / 'ner'
-        pos_model_dir = path / 'pos'
-        if dep_model_dir.exists():
-            shutil.rmtree(str(dep_model_dir))
-        if ner_model_dir.exists():
-            shutil.rmtree(str(ner_model_dir))
-        if pos_model_dir.exists():
-            shutil.rmtree(str(pos_model_dir))
-        dep_model_dir.mkdir()
-        ner_model_dir.mkdir()
-        pos_model_dir.mkdir()
+    def setup_directory(cls, path, **configs):
+        """
+        Create a fresh sub-directory with a config.json for each config.
+
+        path: Model directory. Passed through util.ensure_path so the `/`
+            joins below work on a path object (presumably it converts plain
+            strings to Path, as its use in __init__ suggests; confirm
+            against util).
+        **configs: Maps a sub-directory name (e.g. 'deps', 'ner', 'pos') to
+            the JSON-serializable config to write there. An existing
+            sub-directory of that name is deleted first.
+
+        A 'vocab' sub-directory is created as well if it does not exist.
+        """
+        path = util.ensure_path(path)
+        for name, config in configs.items():
+            directory = path / name
+            if directory.exists():
+                shutil.rmtree(str(directory))
+            directory.mkdir()
+            with (directory / 'config.json').open('wb') as file_:
+                data = ujson.dumps(config, indent=2)
+                # The file is opened in binary mode, so text must be encoded.
+                if isinstance(data, unicode_):
+                    data = data.encode('utf8')
+                file_.write(data)
+        if not (path / 'vocab').exists():
+            (path / 'vocab').mkdir()

+    @classmethod
+    @contextmanager
+    def train(cls, path, gold_tuples, **configs):
        if parser_cfg['pseudoprojective']:
            # preprocess training data here before ArcEager.get_labels() is called
            gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)

-        parser_cfg['actions'] = ArcEager.get_actions(gold_parses=gold_tuples)
-        entity_cfg['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples)
+        for subdir in ('deps', 'ner', 'pos'):
+            if subdir not in configs:
+                configs[subdir] = {}
+        configs['deps']['actions'] = ArcEager.get_actions(gold_parses=gold_tuples)
+        configs['ner']['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples)

-        with (dep_model_dir / 'config.json').open('wb') as file_:
-            data = ujson.dumps(parser_cfg)
-            if isinstance(data, unicode):
-                data = data.encode('utf8')
-            file_.write(data)
-        with (ner_model_dir / 'config.json').open('wb') as file_:
-            data = ujson.dumps(entity_cfg)
-            if isinstance(data, unicode):
-                data = data.encode('utf8')
-            file_.write(data)
-        with (pos_model_dir / 'config.json').open('wb') as file_:
-            data = ujson.dumps(tagger_cfg)
-            if isinstance(data, unicode):
-                data = data.encode('utf8')
-            file_.write(data)
+        cls.setup_directory(path, **configs)

        self = cls(
                path=path,
@ -269,14 +235,14 @@ class Language(object):
        self.entity = self.Defaults.create_entity(self)
        self.pipeline = self.Defaults.create_pipeline(self)
        yield Trainer(self, gold_tuples)
-        self.end_training(path=path)
+        self.end_training()
+        self.save_to_directory(path, deps=self.parser.cfg, ner=self.entity.cfg,
+                               pos=self.tagger.cfg)

    def __init__(self, **overrides):
        if 'data_dir' in overrides and 'path' not in overrides:
            raise ValueError("The argument 'data_dir' has been renamed to 'path'")
-        path = overrides.get('path', True)
-        if isinstance(path, basestring):
-            path = pathlib.Path(path)
+        path = util.ensure_path(overrides.get('path', True))
        if path is True:
            path = util.get_data_path() / self.lang
            if not path.exists() and 'path' not in overrides:
@ -322,7 +288,8 @@ class Language(object):
            self.pipeline = [self.tagger, self.parser, self.matcher, self.entity]

    def __call__(self, text, tag=True, parse=True, entity=True):
-        """Apply the pipeline to some text.  The text can span multiple sentences,
+        """
+        Apply the pipeline to some text.  The text can span multiple sentences,
        and can contain arbtrary whitespace.  Alignment into the original string
        is preserved.

@ -352,7 +319,8 @@ class Language(object):
        return doc

    def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2, batch_size=1000):
-        '''Process texts as a stream, and yield Doc objects in order.
+        """
+        Process texts as a stream, and yield Doc objects in order.

        Supports GIL-free multi-threading.

@ -361,7 +329,7 @@ class Language(object):
            tag (bool)
            parse (bool)
            entity (bool)
-        '''
+        """
        skip = {self.tagger: not tag, self.parser: not parse, self.entity: not entity}
        stream = (self.make_doc(text) for text in texts)
        for proc in self.pipeline:
@ -373,51 +341,35 @@ class Language(object):
        for doc in stream:
            yield doc

-    def end_training(self, path=None):
-        if path is None:
-            path = self.path
-        elif isinstance(path, basestring):
-            path = pathlib.Path(path)
+    def save_to_directory(self, path, **configs):
+        """Save vocab and models under `path`; `configs` may override pos/deps/ner."""
+        path = util.ensure_path(path)  # callers may pass a plain string
+        configs.setdefault('pos', self.tagger.cfg if self.tagger else {})
+        configs.setdefault('deps', self.parser.cfg if self.parser else {})
+        configs.setdefault('ner', self.entity.cfg if self.entity else {})

-        if self.tagger:
-            self.tagger.model.end_training()
-            self.tagger.model.dump(str(path / 'pos' / 'model'))
-        if self.parser:
-            self.parser.model.end_training()
-            self.parser.model.dump(str(path / 'deps' / 'model'))
-        if self.entity:
-            self.entity.model.end_training()
-            self.entity.model.dump(str(path / 'ner' / 'model'))
+        self.setup_directory(path, **configs)

        strings_loc = path / 'vocab' / 'strings.json'
        with strings_loc.open('w', encoding='utf8') as file_:
            self.vocab.strings.dump(file_)
        self.vocab.dump(path / 'vocab' / 'lexemes.bin')
-
+        # TODO: Word vectors?
        if self.tagger:
-            tagger_freqs = list(self.tagger.freqs[TAG].items())
-        else:
-            tagger_freqs = []
+            self.tagger.model.dump(str(path / 'pos' / 'model'))
        if self.parser:
-            dep_freqs = list(self.parser.moves.freqs[DEP].items())
-            head_freqs = list(self.parser.moves.freqs[HEAD].items())
-        else:
-            dep_freqs = []
-            head_freqs = []
+            self.parser.model.dump(str(path / 'deps' / 'model'))
        if self.entity:
-            entity_iob_freqs = list(self.entity.moves.freqs[ENT_IOB].items())
-            entity_type_freqs = list(self.entity.moves.freqs[ENT_TYPE].items())
-        else:
-            entity_iob_freqs = []
-            entity_type_freqs = []
-        with (path / 'vocab' / 'serializer.json').open('wb') as file_:
-            data = ujson.dumps([
-                        (TAG, tagger_freqs),
-                        (DEP, dep_freqs),
-                        (ENT_IOB, entity_iob_freqs),
-                        (ENT_TYPE, entity_type_freqs),
-                        (HEAD, head_freqs)
-                    ])
-            if isinstance(data, unicode):
-                data = data.encode('utf8')
-            file_.write(data)
+            self.entity.model.dump(str(path / 'ner' / 'model'))
+
+    def end_training(self, path=None):
+        """Call end_training() on each available model; save to `path` if given."""
+        for component in (self.tagger, self.parser, self.entity):
+            if component:
+                component.model.end_training()
+        # NB: This is slightly different from before --- we no longer default
+        # to taking nlp.path, so nothing is saved unless a path is passed.
+        # The previous version accepted plain strings too, so coerce here:
+        # save_to_directory builds its file locations with the `/` operator.
+        if path is not None:
+            self.save_to_directory(util.ensure_path(path))
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@ -1,13 +1,8 @@
-from __future__ import unicode_literals, print_function
-import codecs
-import pathlib
-
-import ujson as json
+# coding: utf8
+from __future__ import unicode_literals

 from .symbols import POS, NOUN, VERB, ADJ, PUNCT
-from .symbols import VerbForm_inf, VerbForm_none
-from .symbols import Number_sing
-from .symbols import Degree_pos
+from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos


 class Lemmatizer(object):
@ -38,8 +33,10 @@ class Lemmatizer(object):
        return lemmas

    def is_base_form(self, univ_pos, morphology=None):
-        '''Check whether we're dealing with an uninflected paradigm, so we can
-        avoid lemmatization entirely.'''
+        """
+        Check whether we're dealing with an uninflected paradigm, so we can
+        avoid lemmatization entirely.
+        """
        morphology = {} if morphology is None else morphology
        others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
        true_morph_key = morphology.get('morph', 0)
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@ -1,4 +1,7 @@
 # cython: embedsignature=True
+# coding: utf8
+from __future__ import unicode_literals, print_function
+
 from libc.math cimport sqrt
 from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool
@ -9,14 +12,11 @@ from cython.view cimport array as cvarray
 cimport numpy as np
 np.import_array()

-
-
 from libc.string cimport memset
+import numpy

 from .orth cimport word_shape
 from .typedefs cimport attr_t, flags_t
-import numpy
-
 from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from .attrs cimport IS_BRACKET
@ -30,13 +30,15 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))


 cdef class Lexeme:
-    """An entry in the vocabulary.  A Lexeme has no string context --- it's a
+    """
+    An entry in the vocabulary.  A Lexeme has no string context --- it's a
    word-type, as opposed to a word token.  It therefore has no part-of-speech
    tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
    tag).
    """
    def __init__(self, Vocab vocab, int orth):
-        """Create a Lexeme object.
+        """
+        Create a Lexeme object.

        Arguments:
            vocab (Vocab): The parent vocabulary
@ -80,7 +82,8 @@ cdef class Lexeme:
        return self.c.orth

    def set_flag(self, attr_id_t flag_id, bint value):
-        """Change the value of a boolean flag.
+        """
+        Change the value of a boolean flag.

        Arguments:
            flag_id (int): The attribute ID of the flag to set.
@ -89,7 +92,8 @@ cdef class Lexeme:
        Lexeme.c_set_flag(self.c, flag_id, value)

    def check_flag(self, attr_id_t flag_id):
-        """Check the value of a boolean flag.
+        """
+        Check the value of a boolean flag.

        Arguments:
            flag_id (int): The attribute ID of the flag to query.
@ -98,7 +102,8 @@ cdef class Lexeme:
        return True if Lexeme.c_check_flag(self.c, flag_id) else False

    def similarity(self, other):
-        '''Compute a semantic similarity estimate. Defaults to cosine over vectors.
+        """
+        Compute a semantic similarity estimate. Defaults to cosine over vectors.

        Arguments:
            other:
@ -106,7 +111,7 @@ cdef class Lexeme:
                Token and Lexeme objects.
        Returns:
            score (float): A scalar similarity score. Higher is more similar.
-        '''
+        """
        if self.vector_norm == 0 or other.vector_norm == 0:
            return 0.0
        return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@ -1,7 +1,10 @@
 # cython: profile=True
 # cython: infer_types=True
+# coding: utf8
 from __future__ import unicode_literals

+import ujson
+
 from .typedefs cimport attr_t
 from .typedefs cimport hash_t
 from .attrs cimport attr_id_t
@ -52,12 +55,6 @@ from .attrs import FLAG36 as L9_ENT
 from .attrs import FLAG35 as L10_ENT


-try:
-    import ujson as json
-except ImportError:
-    import json
-
-
 cpdef enum quantifier_t:
    _META
    ONE
@ -180,7 +177,8 @@ cdef class Matcher:

    @classmethod
    def load(cls, path, vocab):
-        '''Load the matcher and patterns from a file path.
+        """
+        Load the matcher and patterns from a file path.

        Arguments:
            path (Path):
@ -189,16 +187,17 @@ cdef class Matcher:
                The vocabulary that the documents to match over will refer to.
        Returns:
            Matcher: The newly constructed object.
-        '''
+        """
        if (path / 'gazetteer.json').exists():
            with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
-                patterns = json.load(file_)
+                patterns = ujson.load(file_)
        else:
            patterns = {}
        return cls(vocab, patterns)

    def __init__(self, vocab, patterns={}):
-        """Create the Matcher.
+        """
+        Create the Matcher.

        Arguments:
            vocab (Vocab):
@ -227,7 +226,8 @@ cdef class Matcher:

    def add_entity(self, entity_key, attrs=None, if_exists='raise',
                   acceptor=None, on_match=None):
-        """Add an entity to the matcher.
+        """
+        Add an entity to the matcher.

        Arguments:
            entity_key (unicode or int):
@ -264,7 +264,8 @@ cdef class Matcher:
        self._callbacks[entity_key] = on_match

    def add_pattern(self, entity_key, token_specs, label=""):
-        """Add a pattern to the matcher.
+        """
+        Add a pattern to the matcher.

        Arguments:
            entity_key (unicode or int):
@ -307,7 +308,8 @@ cdef class Matcher:
            return entity_key

    def has_entity(self, entity_key):
-        """Check whether the matcher has an entity.
+        """
+        Check whether the matcher has an entity.

        Arguments:
            entity_key (string or int): The entity key to check.
@ -318,7 +320,8 @@ cdef class Matcher:
        return entity_key in self._entities

    def get_entity(self, entity_key):
-        """Retrieve the attributes stored for an entity.
+        """
+        Retrieve the attributes stored for an entity.

        Arguments:
            entity_key (unicode or int): The entity to retrieve.
@ -332,7 +335,8 @@ cdef class Matcher:
            return None

    def __call__(self, Doc doc, acceptor=None):
-        """Find all token sequences matching the supplied patterns on the Doc.
+        """
+        Find all token sequences matching the supplied patterns on the Doc.

        Arguments:
            doc (Doc):
@ -445,7 +449,8 @@ cdef class Matcher:
        return matches

    def pipe(self, docs, batch_size=1000, n_threads=2):
-        """Match a stream of documents, yielding them in turn.
+        """
+        Match a stream of documents, yielding them in turn.

        Arguments:
            docs: A stream of documents.
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@ -1,13 +1,9 @@
 # cython: infer_types
+# coding: utf8
 from __future__ import unicode_literals

 from libc.string cimport memset

-try:
-    import ujson as json
-except ImportError:
-    import json
-
 from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
 from .attrs cimport POS, IS_SPACE
 from .parts_of_speech import IDS as POS_IDS
@ -16,7 +12,9 @@ from .attrs import LEMMA, intify_attrs


 def _normalize_props(props):
-    '''Transform deprecated string keys to correct names.'''
+    """
+    Transform deprecated string keys to correct names.
+    """
    out = {}
    for key, value in props.items():
        if key == POS:
@ -98,13 +96,14 @@ cdef class Morphology:
            flags[0] &= ~(one << flag_id)

    def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False):
-        '''Add a special-case rule to the morphological analyser. Tokens whose
+        """
+        Add a special-case rule to the morphological analyser. Tokens whose
        tag and orth match the rule will receive the specified properties.

        Arguments:
            tag (unicode): The part-of-speech tag to key the exception.
            orth (unicode): The word-form to key the exception.
-        '''
+        """
        tag = self.strings[tag_str]
        tag_id = self.reverse_index[tag]
        orth = self.strings[orth_str]
--- a/spacy/multi_words.py
+++ b/spacy/multi_words.py
@ -1,8 +0,0 @@
-class RegexMerger(object):
-    def __init__(self, regexes):
-        self.regexes = regexes
-
-    def __call__(self, tokens):
-        for tag, entity_type, regex in self.regexes:
-            for m in regex.finditer(tokens.string):
-                tokens.merge(m.start(), m.end(), tag, m.group(), entity_type)
--- a/spacy/orth.pyx
+++ b/spacy/orth.pyx
@ -1,6 +1,7 @@
-# coding: utf8
 # cython: infer_types=True
+# coding: utf8
 from __future__ import unicode_literals
+
 import unicodedata
 import re

--- a/spacy/parts_of_speech.pyx
+++ b/spacy/parts_of_speech.pyx
@ -1,3 +1,4 @@
+# coding: utf8
 from __future__ import unicode_literals


--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@ -1,3 +1,6 @@
+# coding: utf8
+from __future__ import unicode_literals
+
 from .syntax.parser cimport Parser
 from .syntax.beam_parser cimport BeamParser
 from .syntax.ner cimport BiluoPushDown
@ -11,44 +14,40 @@ from .attrs import DEP, ENT_TYPE


 cdef class EntityRecognizer(Parser):
-    """Annotate named entities on Doc objects."""
+    """
+    Annotate named entities on Doc objects.
+    """
    TransitionSystem = BiluoPushDown

    feature_templates = get_feature_templates('ner')

    def add_label(self, label):
-        for action in self.moves.action_types:
-            self.moves.add_action(action, label)
-            if 'actions' in self.cfg:
-                self.cfg['actions'].setdefault(action,
-                                        {}).setdefault(label, True)
+        Parser.add_label(self, label)
        if isinstance(label, basestring):
            label = self.vocab.strings[label]
+        # Set label into serializer. Super hacky :(
        for attr, freqs in self.vocab.serializer_freqs:
            if attr == ENT_TYPE and label not in freqs:
                freqs.append([label, 1])
-        # Super hacky :(
        self.vocab._serializer = None


 cdef class BeamEntityRecognizer(BeamParser):
-    """Annotate named entities on Doc objects."""
+    """
+    Annotate named entities on Doc objects.
+    """
    TransitionSystem = BiluoPushDown

    feature_templates = get_feature_templates('ner')

    def add_label(self, label):
-        for action in self.moves.action_types:
-            self.moves.add_action(action, label)
-            if 'actions' in self.cfg:
-                self.cfg['actions'].setdefault(action,
-                                        {}).setdefault(label, True)
+        Parser.add_label(self, label)
        if isinstance(label, basestring):
            label = self.vocab.strings[label]
+        # Set label into serializer. Super hacky :(
        for attr, freqs in self.vocab.serializer_freqs:
            if attr == ENT_TYPE and label not in freqs:
                freqs.append([label, 1])
-        # Super hacky :(
        self.vocab._serializer = None


@ -58,11 +57,7 @@ cdef class DependencyParser(Parser):
    feature_templates = get_feature_templates('basic')

    def add_label(self, label):
-        for action in self.moves.action_types:
-            self.moves.add_action(action, label)
-            if 'actions' in self.cfg:
-                self.cfg['actions'].setdefault(action,
-                                        {}).setdefault(label, True)
+        Parser.add_label(self, label)
        if isinstance(label, basestring):
            label = self.vocab.strings[label]
        for attr, freqs in self.vocab.serializer_freqs:
@ -78,11 +73,7 @@ cdef class BeamDependencyParser(BeamParser):
    feature_templates = get_feature_templates('basic')

    def add_label(self, label):
-        for action in self.moves.action_types:
-            self.moves.add_action(action, label)
-            if 'actions' in self.cfg:
-                self.cfg['actions'].setdefault(action,
-                                        {}).setdefault(label, True)
+        Parser.add_label(self, label)
        if isinstance(label, basestring):
            label = self.vocab.strings[label]
        for attr, freqs in self.vocab.serializer_freqs:
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@ -1,12 +1,13 @@
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
+# coding: utf8
+from __future__ import division, print_function, unicode_literals

 from .gold import tags_to_entities


 class PRFScore(object):
-    """A precision / recall / F score"""
+    """
+    A precision / recall / F score
+    """
    def __init__(self):
        self.tp = 0
        self.fp = 0
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@ -1,12 +1,11 @@
 # cython: infer_types=True
+# coding: utf8
 from __future__ import unicode_literals, absolute_import

 cimport cython
 from libc.string cimport memcpy
 from libc.stdint cimport uint64_t, uint32_t
-
 from murmurhash.mrmr cimport hash64, hash32
-
 from preshed.maps cimport map_iter, key_t

 from .typedefs cimport hash_t
@ -73,13 +72,16 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex


 cdef class StringStore:
-    '''Map strings to and from integer IDs.'''
+    """
+    Map strings to and from integer IDs.
+    """
    def __init__(self, strings=None, freeze=False):
-        '''Create the StringStore.
+        """
+        Create the StringStore.

        Arguments:
            strings: A sequence of unicode strings to add to the store.
-        '''
+        """
        self.mem = Pool()
        self._map = PreshMap()
        self._oov = PreshMap()
@ -104,7 +106,8 @@ cdef class StringStore:
        return (StringStore, (list(self),))

    def __len__(self):
-        """The number of strings in the store.
+        """
+        The number of strings in the store.

        Returns:
            int The number of strings in the store.
@ -112,7 +115,8 @@ cdef class StringStore:
        return self.size-1

    def __getitem__(self, object string_or_id):
-        """Retrieve a string from a given integer ID, or vice versa.
+        """
+        Retrieve a string from a given integer ID, or vice versa.

        Arguments:
            string_or_id (bytes or unicode or int):
@ -159,7 +163,8 @@ cdef class StringStore:
                return utf8str - self.c

    def __contains__(self, unicode string not None):
-        """Check whether a string is in the store.
+        """
+        Check whether a string is in the store.

        Arguments:
            string (unicode): The string to check.
@ -172,7 +177,8 @@ cdef class StringStore:
        return self._map.get(key) is not NULL

    def __iter__(self):
-        """Iterate over the strings in the store, in order.
+        """
+        Iterate over the strings in the store, in order.

        Yields: unicode A string in the store.
        """
@ -230,7 +236,8 @@ cdef class StringStore:
        return &self.c[self.size-1]

    def dump(self, file_):
-        """Save the strings to a JSON file.
+        """
+        Save the strings to a JSON file.

        Arguments:
            file_ (buffer): The file to save the strings.
@ -244,7 +251,8 @@ cdef class StringStore:
        file_.write(string_data)

    def load(self, file_):
-        """Load the strings from a JSON file.
+        """
+        Load the strings from a JSON file.

        Arguments:
            file_ (buffer): The file from which to load the strings.
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@ -1,3 +1,4 @@
+# coding: utf8
 from __future__ import unicode_literals

 IDS = {
--- a/spacy/syntax/_parse_features.pyx
+++ b/spacy/syntax/_parse_features.pyx
@ -7,17 +7,17 @@ out of "context") is in features/extractor.pyx
 The atomic feature names are listed in a big enum, so that the feature tuples
 can refer to them.
 """
-from libc.string cimport memset
+# coding: utf-8
+from __future__ import unicode_literals

+from libc.string cimport memset
 from itertools import combinations
+from cymem.cymem cimport Pool

 from ..structs cimport TokenC
-
 from .stateclass cimport StateClass
 from ._state cimport StateC

-from cymem.cymem cimport Pool
-

 cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
    if token is NULL:
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@ -1,29 +1,26 @@
 # cython: profile=True
 # cython: cdivision=True
 # cython: infer_types=True
+# coding: utf-8
 from __future__ import unicode_literals
+
 from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
-
 import ctypes
-import os
-
-from ..structs cimport TokenC
+from libc.stdint cimport uint32_t
+from libc.string cimport memcpy
+from cymem.cymem cimport Pool

+from .stateclass cimport StateClass
+from ._state cimport StateC, is_space_token
+from .nonproj import PseudoProjectivity
+from .nonproj import is_nonproj_tree
 from .transition_system cimport do_func_t, get_cost_func_t
 from .transition_system cimport move_cost_func_t, label_cost_func_t
 from ..gold cimport GoldParse
 from ..gold cimport GoldParseC
 from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE
 from ..lexeme cimport Lexeme
-
-from libc.stdint cimport uint32_t
-from libc.string cimport memcpy
-
-from cymem.cymem cimport Pool
-from .stateclass cimport StateClass
-from ._state cimport StateC, is_space_token
-from .nonproj import PseudoProjectivity
-from .nonproj import is_nonproj_tree
+from ..structs cimport TokenC


 DEF NON_MONOTONIC = True
@ -317,17 +314,20 @@ cdef class ArcEager(TransitionSystem):
    def get_actions(cls, **kwargs):
        actions = kwargs.get('actions',
                    {
-                        SHIFT: {'': True},
-                        REDUCE: {'': True},
-                        RIGHT: {},
-                        LEFT: {},
-                        BREAK: {'ROOT': True}})
+                        SHIFT: [''],
+                        REDUCE: [''],
+                        RIGHT: [],
+                        LEFT: [],
+                        BREAK: ['ROOT']})
+        # Labels live in ordered lists: test membership so none is added twice.
        for label in kwargs.get('left_labels', []):
            if label.upper() != 'ROOT':
-                actions[LEFT][label] = True
+                if label not in actions[LEFT]:
+                    actions[LEFT].append(label)
        for label in kwargs.get('right_labels', []):
            if label.upper() != 'ROOT':
-                actions[RIGHT][label] = True
+                if label not in actions[RIGHT]:
+                    actions[RIGHT].append(label)

        for raw_text, sents in kwargs.get('gold_parses', []):
            for (ids, words, tags, heads, labels, iob), ctnts in sents:
@ -336,9 +336,11 @@ cdef class ArcEager(TransitionSystem):
                        label = 'ROOT'
                    if label != 'ROOT':
                        if head < child:
-                            actions[RIGHT][label] = True
+                            if label not in actions[RIGHT]:
+                                actions[RIGHT].append(label)
                        elif head > child:
-                            actions[LEFT][label] = True
+                            if label not in actions[LEFT]:
+                                actions[LEFT].append(label)
        return actions

    property action_types:
--- a/spacy/syntax/beam_parser.pyx
+++ b/spacy/syntax/beam_parser.pyx
@ -1,50 +1,34 @@
+"""
+MALT-style dependency parser
+"""
 # cython: profile=True
 # cython: experimental_cpp_class_def=True
 # cython: cdivision=True
 # cython: infer_types=True
-"""
-MALT-style dependency parser
-"""
-from __future__ import unicode_literals
+# coding: utf-8
+
+from __future__ import unicode_literals, print_function
 cimport cython

 from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
-
 from libc.stdint cimport uint32_t, uint64_t
 from libc.string cimport memset, memcpy
 from libc.stdlib cimport rand
 from libc.math cimport log, exp, isnan, isinf
-import random
-import os.path
-from os import path
-import shutil
-import json
-import math
-
 from cymem.cymem cimport Pool, Address
 from murmurhash.mrmr cimport real_hash64 as hash64
 from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
-
-
-from util import Config
-
 from thinc.linear.features cimport ConjunctionExtracter
 from thinc.structs cimport FeatureC, ExampleC
-
-from thinc.extra.search cimport Beam
-from thinc.extra.search cimport MaxViolation
+from thinc.extra.search cimport Beam, MaxViolation
 from thinc.extra.eg cimport Example
 from thinc.extra.mb cimport Minibatch

 from ..structs cimport TokenC
-
 from ..tokens.doc cimport Doc
 from ..strings cimport StringStore
-
 from .transition_system cimport TransitionSystem, Transition
-
 from ..gold cimport GoldParse
-
 from . import _parse_features
 from ._parse_features cimport CONTEXT_SIZE
 from ._parse_features cimport fill_context
@ -266,4 +250,3 @@ def is_gold(StateClass state, GoldParse gold, StringStore strings):
        id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
        truth.add((id_, head, dep))
    return truth == predicted
-
--- a/spacy/syntax/iterators.pyx
+++ b/spacy/syntax/iterators.pyx
@ -1,9 +1,14 @@
-from spacy.parts_of_speech cimport NOUN, PROPN, PRON
+# coding: utf-8
+from __future__ import unicode_literals
+
+from ..parts_of_speech cimport NOUN, PROPN, PRON


 def english_noun_chunks(obj):
-    '''Detect base noun phrases from a dependency parse.
-    Works on both Doc and Span.'''
+    """
+    Detect base noun phrases from a dependency parse.
+    Works on both Doc and Span.
+    """
    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
              'attr', 'ROOT', 'root']
    doc = obj.doc # Ensure works on both Doc and Span.
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@ -1,17 +1,16 @@
+# coding: utf-8
 from __future__ import unicode_literals

-from .transition_system cimport Transition
-from .transition_system cimport do_func_t
-
-from ..structs cimport TokenC, Entity
-
 from thinc.typedefs cimport weight_t
-from ..gold cimport GoldParseC
-from ..gold cimport GoldParse
-from ..attrs cimport ENT_TYPE, ENT_IOB

 from .stateclass cimport StateClass
 from ._state cimport StateC
+from .transition_system cimport Transition
+from .transition_system cimport do_func_t
+from ..structs cimport TokenC, Entity
+from ..gold cimport GoldParseC
+from ..gold cimport GoldParse
+from ..attrs cimport ENT_TYPE, ENT_IOB


 cdef enum:
@ -21,6 +20,7 @@ cdef enum:
    LAST
    UNIT
    OUT
+    ISNT
    N_MOVES


@ -31,6 +31,7 @@ MOVE_NAMES[IN] = 'I'
 MOVE_NAMES[LAST] = 'L'
 MOVE_NAMES[UNIT] = 'U'
 MOVE_NAMES[OUT] = 'O'
+MOVE_NAMES[ISNT] = 'x'


 cdef do_func_t[N_MOVES] do_funcs
@ -54,16 +55,20 @@ cdef class BiluoPushDown(TransitionSystem):
    def get_actions(cls, **kwargs):
        actions = kwargs.get('actions',
                    {
-                        MISSING: {'': True},
-                        BEGIN: {},
-                        IN: {},
-                        LAST: {},
-                        UNIT: {},
-                        OUT: {'': True}
+                        MISSING: [''],
+                        BEGIN: [],
+                        IN: [],
+                        LAST: [],
+                        UNIT: [],
+                        OUT: ['']
                    })
+        seen_entities = set()
        for entity_type in kwargs.get('entity_types', []):
+            if entity_type in seen_entities:
+                continue
+            seen_entities.add(entity_type)
            for action in (BEGIN, IN, LAST, UNIT):
-                actions[action][entity_type] = True
+                actions[action].append(entity_type)
        moves = ('M', 'B', 'I', 'L', 'U')
        for raw_text, sents in kwargs.get('gold_parses', []):
            for (ids, words, tags, heads, labels, biluo), _ in sents:
@ -72,8 +77,10 @@ cdef class BiluoPushDown(TransitionSystem):
                        if ner_tag.count('-') != 1:
                            raise ValueError(ner_tag)
                        _, label = ner_tag.split('-')
+                        if label not in seen_entities:
+                            seen_entities.add(label)
                            for move_str in ('B', 'I', 'L', 'U'):
-                            actions[moves.index(move_str)][label] = True
+                                actions[moves.index(move_str)].append(label)
        return actions

    property action_types:
@ -111,11 +118,17 @@ cdef class BiluoPushDown(TransitionSystem):
            label = 0
        elif '-' in name:
            move_str, label_str = name.split('-', 1)
+            # Hacky way to denote 'not this entity'
+            if label_str.startswith('!'):
+                label_str = label_str[1:]
+                move_str = 'x'
            label = self.strings[label_str]
        else:
            move_str = name
            label = 0
        move = MOVE_NAMES.index(move_str)
+        if move == ISNT:
+            return Transition(clas=0, move=ISNT, label=label, score=0)
        for i in range(self.n_moves):
            if self.c[i].move == move and self.c[i].label == label:
                return self.c[i]
@ -225,6 +238,9 @@ cdef class Begin:
        elif g_act == BEGIN:
            # B, Gold B --> Label match
            return label != g_tag
+        # Support partial supervision in the form of "not this label"
+        elif g_act == ISNT:
+            return label == g_tag
        else:
            # B, Gold I --> False (P)
            # B, Gold L --> False (P)
@ -359,6 +375,9 @@ cdef class Unit:
        elif g_act == UNIT:
            # U, Gold U --> True iff tag match
            return label != g_tag
+        # Support partial supervision in the form of "not this label"
+        elif g_act == ISNT:
+            return label == g_tag
        else:
            # U, Gold B --> False
            # U, Gold I --> False
@ -388,7 +407,7 @@ cdef class Out:
        cdef int g_act = gold.ner[s.B(0)].move
        cdef int g_tag = gold.ner[s.B(0)].label

-        if g_act == MISSING:
+        if g_act == MISSING or g_act == ISNT:
            return 0
        elif g_act == BEGIN:
            # O, Gold B --> False
--- a/spacy/syntax/nonproj.pyx
+++ b/spacy/syntax/nonproj.pyx
@ -1,8 +1,9 @@
+# coding: utf-8
 from __future__ import unicode_literals
 from copy import copy

 from ..tokens.doc cimport Doc
-from spacy.attrs import DEP, HEAD
+from ..attrs import DEP, HEAD


 def ancestors(tokenid, heads):
@ -201,5 +202,3 @@ class PseudoProjectivity:
                filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
            filtered.append((raw_text, filtered_sents))
        return filtered
-
-
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@ -1,58 +1,46 @@
-# cython: infer_types=True
 """
 MALT-style dependency parser
 """
+# coding: utf-8
+# cython: infer_types=True
 from __future__ import unicode_literals
+
+from collections import Counter
+import ujson
+
 cimport cython
 cimport cython.parallel

 from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
 from cpython.exc cimport PyErr_CheckSignals
-
 from libc.stdint cimport uint32_t, uint64_t
 from libc.string cimport memset, memcpy
 from libc.stdlib cimport malloc, calloc, free
-
-import os.path
-from collections import Counter
-from os import path
-import shutil
-import json
-import sys
-from .nonproj import PseudoProjectivity
-
-from cymem.cymem cimport Pool, Address
-from murmurhash.mrmr cimport hash64
 from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
 from thinc.linear.avgtron cimport AveragedPerceptron
 from thinc.linalg cimport VecVec
-from thinc.structs cimport SparseArrayC
+from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
+from thinc.extra.eg cimport Example
+from cymem.cymem cimport Pool, Address
+from murmurhash.mrmr cimport hash64
 from preshed.maps cimport MapStruct
 from preshed.maps cimport map_get

-from thinc.structs cimport FeatureC
-from thinc.structs cimport ExampleC
-from thinc.extra.eg cimport Example
-
-from util import Config
-
-from ..structs cimport TokenC
-
-from ..tokens.doc cimport Doc
-from ..strings cimport StringStore
-
-from .transition_system import OracleError
-from .transition_system cimport TransitionSystem, Transition
-
-from ..gold cimport GoldParse
-
 from . import _parse_features
 from ._parse_features cimport CONTEXT_SIZE
 from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
 from ._state cimport StateC
+from .nonproj import PseudoProjectivity
+from .transition_system import OracleError
+from .transition_system cimport TransitionSystem, Transition
+from ..structs cimport TokenC
+from ..tokens.doc cimport Doc
+from ..strings cimport StringStore
+from ..gold cimport GoldParse

-USE_FTRL = True
+
+USE_FTRL = False
 DEBUG = False
 def set_debug(val):
    global DEBUG
@ -80,7 +68,9 @@ cdef class ParserModel(AveragedPerceptron):
        return nr_feat

    def update(self, Example eg, itn=0):
-        '''Does regression on negative cost. Sort of cute?'''
+        """
+        Does regression on negative cost. Sort of cute?
+        """
        self.time += 1
        cdef int best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class)
        cdef int guess = eg.guess
@ -132,10 +122,13 @@ cdef class ParserModel(AveragedPerceptron):


 cdef class Parser:
-    """Base class of the DependencyParser and EntityRecognizer."""
+    """
+    Base class of the DependencyParser and EntityRecognizer.
+    """
    @classmethod
    def load(cls, path, Vocab vocab, TransitionSystem=None, require=False, **cfg):
-        """Load the statistical model from the supplied path.
+        """
+        Load the statistical model from the supplied path.

        Arguments:
            path (Path):
@ -148,10 +141,16 @@ cdef class Parser:
            The newly constructed object.
        """
        with (path / 'config.json').open() as file_:
-            cfg = json.load(file_)
+            cfg = ujson.load(file_)
        # TODO: remove this shim when we don't have to support older data
        if 'labels' in cfg and 'actions' not in cfg:
            cfg['actions'] = cfg.pop('labels')
+        # TODO: remove this shim when we don't have to support older data
+        for action_name, labels in dict(cfg['actions']).items():
+            # Older data may store the labels as a dict; convert to a sorted list
+            if isinstance(labels, dict):
+                labels = list(sorted(labels.keys()))
+            cfg['actions'][action_name] = labels
        self = cls(vocab, TransitionSystem=TransitionSystem, model=None, **cfg)
        if (path / 'model').exists():
            self.model.load(str(path / 'model'))
@ -161,7 +160,8 @@ cdef class Parser:
        return self

    def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg):
-        """Create a Parser.
+        """
+        Create a Parser.

        Arguments:
            vocab (Vocab):
@ -186,12 +186,18 @@ cdef class Parser:
        self.model.learn_rate = cfg.get('learn_rate', 0.001)

        self.cfg = cfg
+        # TODO: This is a pretty hacky fix to the problem of adding more
+        # labels. The issue is they come in out of order, if labels are
+        # added during training
+        for label in cfg.get('extra_labels', []):
+            self.add_label(label)

    def __reduce__(self):
        return (Parser, (self.vocab, self.moves, self.model), None, None)

    def __call__(self, Doc tokens):
-        """Apply the entity recognizer, setting the annotations onto the Doc object.
+        """
+        Apply the parser or entity recognizer, setting the annotations onto the Doc object.

        Arguments:
            tokens (Doc): The document to be processed.
@ -208,7 +214,8 @@ cdef class Parser:
        self.moves.finalize_doc(tokens)

    def pipe(self, stream, int batch_size=1000, int n_threads=2):
-        """Process a stream of documents.
+        """
+        Process a stream of documents.

        Arguments:
            stream: The sequence of documents to process.
@ -296,7 +303,8 @@ cdef class Parser:
        return 0

    def update(self, Doc tokens, GoldParse gold, itn=0):
-        """Update the statistical model.
+        """
+        Update the statistical model.

        Arguments:
            tokens (Doc):
@ -334,15 +342,17 @@ cdef class Parser:
        self.moves.finalize_state(stcls.c)
        return loss

-    def step_through(self, Doc doc):
-        """Set up a stepwise state, to introspect and control the transition sequence.
+    def step_through(self, Doc doc, GoldParse gold=None):
+        """
+        Set up a stepwise state, to introspect and control the transition sequence.

        Arguments:
            doc (Doc): The document to step through.
+            gold (GoldParse): Optional gold parse
        Returns (StepwiseState):
            A state object, to step through the annotation process.
        """
-        return StepwiseState(self, doc)
+        return StepwiseState(self, doc, gold=gold)

    def from_transition_sequence(self, Doc doc, sequence):
        """Control the annotations on a document by specifying a transition sequence
@ -360,18 +370,28 @@ cdef class Parser:
    def add_label(self, label):
        # Doesn't set label into serializer -- subclasses override it to do that.
        for action in self.moves.action_types:
-            self.moves.add_action(action, label)
+            added = self.moves.add_action(action, label)
+            if added:
+                # Important that the labels be stored as a list! We need the
+                # order, or the model goes out of synch
+                self.cfg.setdefault('extra_labels', []).append(label)


 cdef class StepwiseState:
    cdef readonly StateClass stcls
    cdef readonly Example eg
    cdef readonly Doc doc
+    cdef readonly GoldParse gold
    cdef readonly Parser parser

-    def __init__(self, Parser parser, Doc doc):
+    def __init__(self, Parser parser, Doc doc, GoldParse gold=None):
        self.parser = parser
        self.doc = doc
+        if gold is not None:
+            self.gold = gold
+            self.parser.moves.preprocess_gold(self.gold)
+        else:
+            self.gold = GoldParse(doc)
        self.stcls = StateClass.init(doc.c, doc.length)
        self.parser.moves.initialize_state(self.stcls.c)
        self.eg = Example(
@ -406,6 +426,24 @@ cdef class StepwiseState:
        return [self.doc.vocab.strings[self.stcls.c._sent[i].dep]
                for i in range(self.stcls.c.length)]

+    @property
+    def costs(self):
+        """
+        Return a dict mapping the name of each valid action to its cost in the current state.
+        """
+        if not self.gold:
+            raise ValueError("Can't set costs: No GoldParse provided")
+        self.parser.moves.set_costs(self.eg.c.is_valid, self.eg.c.costs,
+                self.stcls, self.gold)
+        costs = {}
+        for i in range(self.parser.moves.n_moves):
+            if not self.eg.c.is_valid[i]:
+                continue
+            transition = self.parser.moves.c[i]
+            name = self.parser.moves.move_name(transition.move, transition.label)
+            costs[name] = self.eg.c.costs[i]
+        return costs
+
    def predict(self):
        self.eg.reset()
        self.eg.c.nr_feat = self.parser.model.set_featuresC(self.eg.c.atoms, self.eg.c.features,
--- a/spacy/syntax/stateclass.pyx
+++ b/spacy/syntax/stateclass.pyx
@ -1,5 +1,9 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
 from libc.string cimport memcpy, memset
 from libc.stdint cimport uint32_t
+
 from ..vocab cimport EMPTY_LEXEME
 from ..structs cimport Entity
 from ..lexeme cimport Lexeme
--- a/spacy/syntax/transition_system.pyx
+++ b/spacy/syntax/transition_system.pyx
@ -1,4 +1,8 @@
 # cython: infer_types=True
+# coding: utf-8
+from __future__ import unicode_literals
+
+from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport weight_t
 from collections import defaultdict
@ -6,7 +10,6 @@ from collections import defaultdict
 from ..structs cimport TokenC
 from .stateclass cimport StateClass
 from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
-from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF


 cdef weight_t MIN_SCORE = -90000
@ -32,7 +35,7 @@ cdef class TransitionSystem:
        self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition))

        for action, label_strs in sorted(labels_by_action.items()):
-            for label_str in sorted(label_strs):
+            for label_str in label_strs:
                self.add_action(int(action), label_str)
        self.root_label = self.strings['ROOT']
        self.freqs = {} if _freqs is None else _freqs
--- a/spacy/syntax/util.py
+++ b/spacy/syntax/util.py
@ -1,18 +0,0 @@
-from os import path
-import json
-
-class Config(object):
-    def __init__(self, **kwargs):
-        for key, value in kwargs.items():
-            setattr(self, key, value)
-
-    def get(self, attr, default=None):
-        return self.__dict__.get(attr, default)
-
-    @classmethod
-    def write(cls, model_dir, name, **kwargs):
-        open(path.join(model_dir, '%s.json' % name), 'w').write(json.dumps(kwargs))
-
-    @classmethod
-    def read(cls, model_dir, name):
-        return cls(**json.load(open(path.join(model_dir, '%s.json' % name))))
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@ -1,5 +1,7 @@
-import json
-import pathlib
+# coding: utf8
+from __future__ import unicode_literals
+
+import ujson
 from collections import defaultdict

 from cymem.cymem cimport Pool
@ -12,8 +14,8 @@ from thinc.linalg cimport VecVec
 from .tokens.doc cimport Doc
 from .attrs cimport TAG
 from .gold cimport GoldParse
-
 from .attrs cimport *
+from . import util


 cpdef enum:
@ -106,10 +108,13 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:


 cdef class Tagger:
-    """Annotate part-of-speech tags on Doc objects."""
+    """
+    Annotate part-of-speech tags on Doc objects.
+    """
    @classmethod
    def load(cls, path, vocab, require=False):
-        """Load the statistical model from the supplied path.
+        """
+        Load the statistical model from the supplied path.

        Arguments:
            path (Path):
@ -123,10 +128,10 @@ cdef class Tagger:
        """
        # TODO: Change this to expect config.json when we don't have to
        # support old data.
-        path = path if not isinstance(path, basestring) else pathlib.Path(path)
+        path = util.ensure_path(path)
        if (path / 'templates.json').exists():
            with (path / 'templates.json').open('r', encoding='utf8') as file_:
-                templates = json.load(file_)
+                templates = ujson.load(file_)
        elif require:
            raise IOError(
                "Required file %s/templates.json not found when loading Tagger" % str(path))
@ -142,7 +147,8 @@ cdef class Tagger:
        return self

    def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
-        """Create a Tagger.
+        """
+        Create a Tagger.

        Arguments:
            vocab (Vocab):
@ -180,7 +186,8 @@ cdef class Tagger:
        tokens._py_tokens = [None] * tokens.length

    def __call__(self, Doc tokens):
-        """Apply the tagger, setting the POS tags onto the Doc object.
+        """
+        Apply the tagger, setting the POS tags onto the Doc object.

        Arguments:
            tokens (Doc): The tokens to be tagged.
@ -208,7 +215,8 @@ cdef class Tagger:
        tokens._py_tokens = [None] * tokens.length

    def pipe(self, stream, batch_size=1000, n_threads=2):
-        """Tag a stream of documents.
+        """
+        Tag a stream of documents.

        Arguments:
            stream: The sequence of documents to tag.
@ -225,7 +233,8 @@ cdef class Tagger:
            yield doc

    def update(self, Doc tokens, GoldParse gold, itn=0):
-        """Update the statistical model, with tags supplied for the given document.
+        """
+        Update the statistical model, with tags supplied for the given document.

        Arguments:
            tokens (Doc):
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@ -1,17 +1,11 @@
 # cython: embedsignature=True
+# coding: utf8
 from __future__ import unicode_literals

-import pathlib
+import ujson

 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
-
-try:
-    import ujson as json
-except ImportError:
-    import json
-
-
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap

@ -23,11 +17,14 @@ from .tokens.doc cimport Doc


 cdef class Tokenizer:
-    """Segment text, and create Doc objects with the discovered segment boundaries."""
+    """
+    Segment text, and create Doc objects with the discovered segment boundaries.
+    """
    @classmethod
    def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None,
             infix_finditer=None, token_match=None):
-        '''Load a Tokenizer, reading unsupplied components from the path.
+        """
+        Load a Tokenizer, reading unsupplied components from the path.

        Arguments:
            path (Path):
@ -45,13 +42,11 @@ cdef class Tokenizer:
            infix_finditer:
                Signature of re.compile(string).finditer
        Returns Tokenizer
-        '''
-        if isinstance(path, basestring):
-            path = pathlib.Path(path)
-
+        """
+        path = util.ensure_path(path)
        if rules is None:
            with (path / 'tokenizer' / 'specials.json').open('r', encoding='utf8') as file_:
-                rules = json.load(file_)
+                rules = ujson.load(file_)
        if prefix_search in (None, True):
            with (path / 'tokenizer' / 'prefix.txt').open() as file_:
                entries = file_.read().split('\n')
@ -67,7 +62,8 @@ cdef class Tokenizer:
        return cls(vocab, rules, prefix_search, suffix_search, infix_finditer, token_match)

    def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None):
-        '''Create a Tokenizer, to create Doc objects given unicode text.
+        """
+        Create a Tokenizer, to create Doc objects given unicode text.

        Arguments:
            vocab (Vocab):
@ -85,7 +81,7 @@ cdef class Tokenizer:
                to find infixes.
            token_match:
                A boolean function matching strings that become tokens.
-        '''
+        """
        self.mem = Pool()
        self._cache = PreshMap()
        self._specials = PreshMap()
@ -117,7 +113,8 @@ cdef class Tokenizer:

    @cython.boundscheck(False)
    def __call__(self, unicode string):
-        """Tokenize a string.
+        """
+        Tokenize a string.

        Arguments:
            string (unicode): The string to tokenize.
@ -170,7 +167,8 @@ cdef class Tokenizer:
        return tokens

    def pipe(self, texts, batch_size=1000, n_threads=2):
-        """Tokenize a stream of texts.
+        """
+        Tokenize a stream of texts.

        Arguments:
            texts: A sequence of unicode texts.
@ -324,7 +322,8 @@ cdef class Tokenizer:
        self._cache.set(key, cached)

    def find_infix(self, unicode string):
-        """Find internal split points of the string, such as hyphens.
+        """
+        Find internal split points of the string, such as hyphens.

        string (unicode): The string to segment.

@ -337,7 +336,8 @@ cdef class Tokenizer:
        return list(self.infix_finditer(string))

    def find_prefix(self, unicode string):
-        """Find the length of a prefix that should be segmented from the string,
+        """
+        Find the length of a prefix that should be segmented from the string,
        or 0 if no prefix rules match.

        Arguments:
@ -350,7 +350,8 @@ cdef class Tokenizer:
        return (match.end() - match.start()) if match is not None else 0

    def find_suffix(self, unicode string):
-        """Find the length of a suffix that should be segmented from the string,
+        """
+        Find the length of a suffix that should be segmented from the string,
        or 0 if no suffix rules match.

        Arguments:
@ -363,13 +364,15 @@ cdef class Tokenizer:
        return (match.end() - match.start()) if match is not None else 0

    def _load_special_tokenization(self, special_cases):
-        '''Add special-case tokenization rules.
-        '''
+        """
+        Add special-case tokenization rules.
+        """
        for chunk, substrings in sorted(special_cases.items()):
            self.add_special_case(chunk, substrings)

    def add_special_case(self, unicode string, substrings):
-        '''Add a special-case tokenization rule.
+        """
+        Add a special-case tokenization rule.

        Arguments:
            string (unicode): The string to specially tokenize.
@ -378,7 +381,7 @@ cdef class Tokenizer:
                attributes. The ORTH fields of the attributes must exactly match
                the string when they are concatenated.
        Returns None
-        '''
+        """
        substrings = list(substrings)
        cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
        cached.length = len(substrings)
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -1,15 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
 cimport cython
+cimport numpy as np
+import numpy
+import numpy.linalg
+import struct
+
 from libc.string cimport memcpy, memset
 from libc.stdint cimport uint32_t
 from libc.math cimport sqrt

-import numpy
-import numpy.linalg
-import struct
-cimport numpy as np
-import six
-import warnings
-
+from .span cimport Span
+from .token cimport Token
 from ..lexeme cimport Lexeme
 from ..lexeme cimport EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
@ -19,11 +22,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
 from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport Lexeme
-from .span cimport Span
-from .token cimport Token
 from ..serialize.bits cimport BitArray
 from ..util import normalize_slice
 from ..syntax.iterators import CHUNKERS
+from ..compat import is_config


 DEF PADDING = 5
@ -76,7 +78,7 @@ cdef class Doc:

    """
    def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
-        '''
+        """
        Create a Doc object.

        Aside: Implementation
@ -97,7 +99,7 @@ cdef class Doc:
                A list of boolean values, of the same length as words. True
                means that the word is followed by a space, False means it is not.
                If None, defaults to [True]*len(words)
-        '''
+        """
        self.vocab = vocab
        size = 20
        self.mem = Pool()
@ -158,7 +160,7 @@ cdef class Doc:
            self.is_parsed = True

    def __getitem__(self, object i):
-        '''
+        """
        doc[i]
            Get the Token object at position i, where i is an integer.
            Negative indexing is supported, and follows the usual Python
@ -172,7 +174,7 @@ cdef class Doc:
            are not supported, as `Span` objects must be contiguous (cannot have gaps).
            You can use negative indices and open-ended ranges, which have their
            normal Python semantics.
-        '''
+        """
        if isinstance(i, slice):
            start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
            return Span(self, start, stop, label=0)
@ -186,7 +188,7 @@ cdef class Doc:
            return Token.cinit(self.vocab, &self.c[i], i, self)

    def __iter__(self):
-        '''
+        """
        for token in doc
            Iterate over `Token`  objects, from which the annotations can
            be easily accessed. This is the main way of accessing Token
@ -194,7 +196,7 @@ cdef class Doc:
            Python. If faster-than-Python speeds are required, you can
            instead access the annotations as a numpy array, or access the
            underlying C data directly from Cython.
-        '''
+        """
        cdef int i
        for i in range(self.length):
            if self._py_tokens[i] is not None:
@ -203,10 +205,10 @@ cdef class Doc:
                yield Token.cinit(self.vocab, &self.c[i], i, self)

    def __len__(self):
-        '''
+        """
        len(doc)
            The number of tokens in the document.
-        '''
+        """
        return self.length

    def __unicode__(self):
@ -216,7 +218,7 @@ cdef class Doc:
        return u''.join([t.text_with_ws for t in self]).encode('utf-8')

    def __str__(self):
-        if six.PY3:
+        if is_config(python3=True):
            return self.__unicode__()
        return self.__bytes__()

@ -228,7 +230,8 @@ cdef class Doc:
        return self

    def similarity(self, other):
-        '''Make a semantic similarity estimate. The default estimate is cosine
+        """
+        Make a semantic similarity estimate. The default estimate is cosine
        similarity using an average of word vectors.

        Arguments:
@ -237,7 +240,7 @@ cdef class Doc:

        Return:
            score (float): A scalar similarity score. Higher is more similar.
-        '''
+        """
        if 'similarity' in self.user_hooks:
            return self.user_hooks['similarity'](self, other)
        if self.vector_norm == 0 or other.vector_norm == 0:
@ -245,9 +248,9 @@ cdef class Doc:
        return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)

    property has_vector:
-        '''
+        """
        A boolean value indicating whether a word vector is associated with the object.
-        '''
+        """
        def __get__(self):
            if 'has_vector' in self.user_hooks:
                return self.user_hooks['has_vector'](self)
@ -255,11 +258,11 @@ cdef class Doc:
            return any(token.has_vector for token in self)

    property vector:
-        '''
+        """
        A real-valued meaning representation. Defaults to an average of the token vectors.

        Type: numpy.ndarray[ndim=1, dtype='float32']
-        '''
+        """
        def __get__(self):
            if 'vector' in self.user_hooks:
                return self.user_hooks['vector'](self)
@ -294,17 +297,21 @@ cdef class Doc:
        return self.text

    property text:
-        '''A unicode representation of the document text.'''
+        """
+        A unicode representation of the document text.
+        """
        def __get__(self):
            return u''.join(t.text_with_ws for t in self)

    property text_with_ws:
-        '''An alias of Doc.text, provided for duck-type compatibility with Span and Token.'''
+        """
+        An alias of Doc.text, provided for duck-type compatibility with Span and Token.
+        """
        def __get__(self):
            return self.text

    property ents:
-        '''
+        """
        Yields named-entity `Span` objects, if the entity recognizer
        has been applied to the document. Iterate over the span to get
        individual Token objects, or access the label:
@ -318,7 +325,7 @@ cdef class Doc:
            assert ents[0].label_ == 'PERSON'
            assert ents[0].orth_ == 'Best'
            assert ents[0].text == 'Mr. Best'
-        '''
+        """
        def __get__(self):
            cdef int i
            cdef const TokenC* token
@ -382,13 +389,13 @@ cdef class Doc:
                    self.c[start].ent_iob = 3

    property noun_chunks:
-        '''
+        """
        Yields base noun-phrase `Span` objects, if the document
        has been syntactically parsed. A base noun phrase, or
        'NP chunk', is a noun phrase that does not permit other NPs to
        be nested within it – so no NP-level coordination, no prepositional
-        phrases, and no relative clauses. For example:
-        '''
+        phrases, and no relative clauses.
+        """
        def __get__(self):
            if not self.is_parsed:
                raise ValueError(
@ -496,7 +503,8 @@ cdef class Doc:
        return output

    def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
-        """Produce a dict of {attribute (int): count (ints)} frequencies, keyed
+        """
+        Produce a dict of {attribute (int): count (int)} frequencies, keyed
        by the values of the given attribute ID.

        Example:
@ -563,8 +571,9 @@ cdef class Doc:
            self.c[i] = parsed[i]

    def from_array(self, attrs, array):
-        '''Write to a `Doc` object, from an `(M, N)` array of attributes.
-        '''
+        """
+        Write to a `Doc` object, from an `(M, N)` array of attributes.
+        """
        cdef int i, col
        cdef attr_id_t attr_id
        cdef TokenC* tokens = self.c
@ -603,19 +612,23 @@ cdef class Doc:
        return self

    def to_bytes(self):
-        '''Serialize, producing a byte string.'''
+        """
+        Serialize, producing a byte string.
+        """
        byte_string = self.vocab.serializer.pack(self)
        cdef uint32_t length = len(byte_string)
        return struct.pack('I', length) + byte_string

    def from_bytes(self, data):
-        '''Deserialize, loading from bytes.'''
+        """
+        Deserialize, loading from bytes.
+        """
        self.vocab.serializer.unpack_into(data[4:], self)
        return self

    @staticmethod
    def read_bytes(file_):
-        '''
+        """
        A static method, used to read serialized `Doc` objects from
        a file. For example:

@ -630,7 +643,7 @@ cdef class Doc:
                for byte_string in Doc.read_bytes(file_):
                    docs.append(Doc(nlp.vocab).from_bytes(byte_string))
            assert len(docs) == 2
-        '''
+        """
        keep_reading = True
        while keep_reading:
            try:
@ -644,7 +657,8 @@ cdef class Doc:
            yield n_bytes_str + data

    def merge(self, int start_idx, int end_idx, *args, **attributes):
-        """Retokenize the document, such that the span at doc.text[start_idx : end_idx]
+        """
+        Retokenize the document, such that the span at doc.text[start_idx : end_idx]
        is merged into a single token. If start_idx and end_idx do not mark start
        and end token boundaries, the document remains unchanged.

@ -658,7 +672,6 @@ cdef class Doc:
            token (Token):
                The newly merged token, or None if the start and end indices did
                not fall at token boundaries.
-
        """
        cdef unicode tag, lemma, ent_type
        if len(args) == 3:
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -1,26 +1,31 @@
+# coding: utf8
 from __future__ import unicode_literals
 from collections import defaultdict
+
+cimport numpy as np
 import numpy
 import numpy.linalg
-cimport numpy as np
 from libc.math cimport sqrt
-import six

+from .doc cimport token_by_start, token_by_end
 from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport flags_t, attr_t, hash_t
 from ..attrs cimport attr_id_t
 from ..parts_of_speech cimport univ_pos_t
 from ..util import normalize_slice
-from .doc cimport token_by_start, token_by_end
 from ..attrs cimport IS_PUNCT, IS_SPACE
 from ..lexeme cimport Lexeme
+from ..compat import is_config


 cdef class Span:
-    """A slice from a Doc object."""
+    """
+    A slice from a Doc object.
+    """
    def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
                  vector_norm=None):
-        '''Create a Span object from the slice doc[start : end]
+        """
+        Create a Span object from the slice doc[start : end]

        Arguments:
            doc (Doc): The parent document.
@ -30,7 +35,7 @@ cdef class Span:
            vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
        Returns:
            Span The newly constructed object.
-        '''
+        """
        if not (0 <= start <= end <= len(doc)):
            raise IndexError

@ -68,7 +73,7 @@ cdef class Span:
        return self.end - self.start

    def __repr__(self):
-        if six.PY3:
+        if is_config(python3=True):
            return self.text
        return self.text.encode('utf-8')

@ -89,7 +94,8 @@ cdef class Span:
            yield self.doc[i]

    def merge(self, *args, **attributes):
-        """Retokenize the document, such that the span is merged into a single token.
+        """
+        Retokenize the document, such that the span is merged into a single token.

        Arguments:
            **attributes:
@ -102,7 +108,8 @@ cdef class Span:
        return self.doc.merge(self.start_char, self.end_char, *args, **attributes)

    def similarity(self, other):
-        '''Make a semantic similarity estimate. The default estimate is cosine
+        """
+        Make a semantic similarity estimate. The default estimate is cosine
        similarity using an average of word vectors.

        Arguments:
@ -111,7 +118,7 @@ cdef class Span:

        Return:
            score (float): A scalar similarity score. Higher is more similar.
-        '''
+        """
        if 'similarity' in self.doc.user_span_hooks:
            self.doc.user_span_hooks['similarity'](self, other)
        if self.vector_norm == 0.0 or other.vector_norm == 0.0:
@ -133,11 +140,12 @@ cdef class Span:
            self.end = end + 1

    property sent:
-        '''The sentence span that this span is a part of.
+        """
+        The sentence span that this span is a part of.

        Returns:
            Span The sentence this is part of.
-        '''
+        """
        def __get__(self):
            if 'sent' in self.doc.user_span_hooks:
                return self.doc.user_span_hooks['sent'](self)
@ -198,13 +206,13 @@ cdef class Span:
            return u''.join([t.text_with_ws for t in self])

    property noun_chunks:
-        '''
+        """
        Yields base noun-phrase `Span` objects, if the document
        has been syntactically parsed. A base noun phrase, or
        'NP chunk', is a noun phrase that does not permit other NPs to
        be nested within it – so no NP-level coordination, no prepositional
        phrases, and no relative clauses.
-        '''
+        """
        def __get__(self):
            if not self.doc.is_parsed:
                raise ValueError(
@ -223,17 +231,16 @@ cdef class Span:
                yield span

    property root:
-        """The token within the span that's highest in the parse tree. If there's a tie, the earlist is prefered.
+        """
+        The token within the span that's highest in the parse tree. If there's a
+        tie, the earliest is preferred.

        Returns:
            Token: The root token.

-        i.e. has the
-        shortest path to the root of the sentence (or is the root itself).
-
-        If multiple words are equally high in the tree, the first word is taken.
-
-        For example:
+        i.e. has the shortest path to the root of the sentence (or is the root
+        itself). If multiple words are equally high in the tree, the first word
+        is taken. For example:

        >>> toks = nlp(u'I like New York in Autumn.')

@ -303,7 +310,8 @@ cdef class Span:
                return self.doc[root]

    property lefts:
-        """Tokens that are to the left of the span, whose head is within the Span.
+        """
+        Tokens that are to the left of the span, whose head is within the Span.

        Yields: Token A left-child of a token of the span.
        """
@ -314,7 +322,8 @@ cdef class Span:
                        yield left

    property rights:
-        """Tokens that are to the right of the Span, whose head is within the Span.
+        """
+        Tokens that are to the right of the Span, whose head is within the Span.

        Yields: Token A right-child of a token of the span.
        """
@ -325,7 +334,8 @@ cdef class Span:
                        yield right

    property subtree:
-        """Tokens that descend from tokens in the span, but fall outside it.
+        """
+        Tokens that descend from tokens in the span, but fall outside it.

        Yields: Token A descendant of a token within the span.
        """
@ -337,7 +347,9 @@ cdef class Span:
                yield from word.subtree

    property ent_id:
-        '''An (integer) entity ID. Usually assigned by patterns in the Matcher.'''
+        """
+        An (integer) entity ID. Usually assigned by patterns in the Matcher.
+        """
        def __get__(self):
            return self.root.ent_id

@ -345,9 +357,11 @@ cdef class Span:
            # TODO
            raise NotImplementedError(
                "Can't yet set ent_id from Span. Vote for this feature on the issue "
-                "tracker: http://github.com/spacy-io/spaCy")
+                "tracker: http://github.com/explosion/spaCy/issues")
    property ent_id_:
-        '''A (string) entity ID. Usually assigned by patterns in the Matcher.'''
+        """
+        A (string) entity ID. Usually assigned by patterns in the Matcher.
+        """
        def __get__(self):
            return self.root.ent_id_

@ -355,7 +369,7 @@ cdef class Span:
            # TODO
            raise NotImplementedError(
                "Can't yet set ent_id_ from Span. Vote for this feature on the issue "
-                "tracker: http://github.com/spacy-io/spaCy")
+                "tracker: http://github.com/explosion/spaCy/issues")

    property orth_:
        def __get__(self):
@ -397,5 +411,5 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
            raise RuntimeError(
                "Array bounds exceeded while searching for root word. This likely "
                "means the parse tree is in an invalid state. Please report this "
-                "issue here: http://github.com/honnibal/spaCy/")
+                "issue here: http://github.com/explosion/spaCy/issues")
    return n
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -1,5 +1,5 @@
-# coding: utf8
 # cython: infer_types=True
+# coding: utf8
 from __future__ import unicode_literals

 from libc.string cimport memcpy
@ -8,20 +8,15 @@ from cpython.mem cimport PyMem_Malloc, PyMem_Free
 from cython.view cimport array as cvarray
 cimport numpy as np
 np.import_array()
-
 import numpy
-import six
-

 from ..typedefs cimport hash_t
 from ..lexeme cimport Lexeme
 from .. import parts_of_speech
-
 from ..attrs cimport LEMMA
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport POS, LEMMA, TAG, DEP
 from ..parts_of_speech cimport CCONJ, PUNCT
-
 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from ..attrs cimport IS_BRACKET
 from ..attrs cimport IS_QUOTE
@ -29,12 +24,13 @@ from ..attrs cimport IS_LEFT_PUNCT
 from ..attrs cimport IS_RIGHT_PUNCT
 from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from ..attrs cimport IS_OOV
-
 from ..lexeme cimport Lexeme
+from ..compat import is_config


 cdef class Token:
-    """An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
+    """
+    An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
    """
    def __cinit__(self, Vocab vocab, Doc doc, int offset):
        self.vocab = vocab
@ -46,7 +42,9 @@ cdef class Token:
        return hash((self.doc, self.i))

    def __len__(self):
-        '''Number of unicode characters in token.text'''
+        """
+        Number of unicode characters in token.text.
+        """
        return self.c.lex.length

    def __unicode__(self):
@ -56,7 +54,7 @@ cdef class Token:
        return self.text.encode('utf8')

    def __str__(self):
-        if six.PY3:
+        if is_config(python3=True):
            return self.__unicode__()
        return self.__bytes__()

@ -83,27 +81,30 @@ cdef class Token:
            raise ValueError(op)

    cpdef bint check_flag(self, attr_id_t flag_id) except -1:
-        '''Check the value of a boolean flag.
+        """
+        Check the value of a boolean flag.

        Arguments:
            flag_id (int): The ID of the flag attribute.
        Returns:
            is_set (bool): Whether the flag is set.
-        '''
+        """
        return Lexeme.c_check_flag(self.c.lex, flag_id)

    def nbor(self, int i=1):
-        '''Get a neighboring token.
+        """
+        Get a neighboring token.

        Arguments:
            i (int): The relative position of the token to get. Defaults to 1.
        Returns:
            neighbor (Token): The token at position self.doc[self.i+i]
-        '''
+        """
        return self.doc[self.i+i]

    def similarity(self, other):
-        '''Compute a semantic similarity estimate. Defaults to cosine over vectors.
+        """
+        Compute a semantic similarity estimate. Defaults to cosine over vectors.

        Arguments:
            other:
@ -111,7 +112,7 @@ cdef class Token:
                Token and Lexeme objects.
        Returns:
            score (float): A scalar similarity score. Higher is more similar.
-        '''
+        """
        if 'similarity' in self.doc.user_token_hooks:
                return self.doc.user_token_hooks['similarity'](self)
        if self.vector_norm == 0 or other.vector_norm == 0:
@ -209,9 +210,9 @@ cdef class Token:
            self.c.dep = label

    property has_vector:
-        '''
+        """
        A boolean value indicating whether a word vector is associated with the object.
-        '''
+        """
        def __get__(self):
            if 'has_vector' in self.doc.user_token_hooks:
                return self.doc.user_token_hooks['has_vector'](self)
@ -223,11 +224,11 @@ cdef class Token:
                return False

    property vector:
-        '''
+        """
        A real-valued meaning representation.

        Type: numpy.ndarray[ndim=1, dtype='float32']
-        '''
+        """
        def __get__(self):
            if 'vector' in self.doc.user_token_hooks:
                return self.doc.user_token_hooks['vector'](self)
@ -245,6 +246,7 @@ cdef class Token:
    property repvec:
        def __get__(self):
            raise AttributeError("repvec was renamed to vector in v0.100")
+
    property has_repvec:
        def __get__(self):
            raise AttributeError("has_repvec was renamed to has_vector in v0.100")
@ -265,7 +267,8 @@ cdef class Token:

    property lefts:
        def __get__(self):
-            """The leftward immediate children of the word, in the syntactic
+            """
+            The leftward immediate children of the word, in the syntactic
            dependency parse.
            """
            cdef int nr_iter = 0
@ -282,8 +285,10 @@ cdef class Token:

    property rights:
        def __get__(self):
-            """The rightward immediate children of the word, in the syntactic
-            dependency parse."""
+            """
+            The rightward immediate children of the word, in the syntactic
+            dependency parse.
+            """
            cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
            tokens = []
            cdef int nr_iter = 0
@ -300,19 +305,21 @@ cdef class Token:
                yield t

    property children:
-        '''A sequence of the token's immediate syntactic children.
+        """
+        A sequence of the token's immediate syntactic children.

        Yields: Token A child token such that child.head==self
-        '''
+        """
        def __get__(self):
            yield from self.lefts
            yield from self.rights

    property subtree:
-        '''A sequence of all the token's syntactic descendents.
+        """
+        A sequence of all the token's syntactic descendents.

        Yields: Token A descendent token such that self.is_ancestor(descendent)
-        '''
+        """
        def __get__(self):
            for word in self.lefts:
                yield from word.subtree
@ -321,26 +328,29 @@ cdef class Token:
                yield from word.subtree

    property left_edge:
-        '''The leftmost token of this token's syntactic descendents.
+        """
+        The leftmost token of this token's syntactic descendents.

        Returns: Token The first token such that self.is_ancestor(token)
-        '''
+        """
        def __get__(self):
            return self.doc[self.c.l_edge]

    property right_edge:
-        '''The rightmost token of this token's syntactic descendents.
+        """
+        The rightmost token of this token's syntactic descendents.

        Returns: Token The last token such that self.is_ancestor(token)
-        '''
+        """
        def __get__(self):
            return self.doc[self.c.r_edge]

    property ancestors:
-        '''A sequence of this token's syntactic ancestors.
+        """
+        A sequence of this token's syntactic ancestors.

        Yields: Token A sequence of ancestor tokens such that ancestor.is_ancestor(self)
-        '''
+        """
        def __get__(self):
            cdef const TokenC* head_ptr = self.c
            # guard against infinite loop, no token can have
@ -356,25 +366,29 @@ cdef class Token:
        return self.is_ancestor(descendant)

    def is_ancestor(self, descendant):
-        '''Check whether this token is a parent, grandparent, etc. of another
+        """
+        Check whether this token is a parent, grandparent, etc. of another
        in the dependency tree.

        Arguments:
            descendant (Token): Another token.
        Returns:
            is_ancestor (bool): Whether this token is the ancestor of the descendant.
-        '''
+        """
        if self.doc is not descendant.doc:
            return False
        return any( ancestor.i == self.i for ancestor in descendant.ancestors )

    property head:
-        '''The syntactic parent, or "governor", of this token.
+        """
+        The syntactic parent, or "governor", of this token.

        Returns: Token
-        '''
+        """
        def __get__(self):
-            """The token predicted by the parser to be the head of the current token."""
+            """
+            The token predicted by the parser to be the head of the current token.
+            """
            return self.doc[self.i + self.c.head]
        def __set__(self, Token new_head):
            # this function sets the head of self to new_head
@ -467,10 +481,11 @@ cdef class Token:
            self.c.head = rel_newhead_i

    property conjuncts:
-        '''A sequence of coordinated tokens, including the token itself.
+        """
+        A sequence of coordinated tokens, including the token itself.

        Yields: Token A coordinated token
-        '''
+        """
        def __get__(self):
            """Get a list of conjoined words."""
            cdef Token word
@ -501,7 +516,9 @@ cdef class Token:
            return iob_strings[self.c.ent_iob]

    property ent_id:
-        '''An (integer) entity ID. Usually assigned by patterns in the Matcher.'''
+        """
+        An (integer) entity ID. Usually assigned by patterns in the Matcher.
+        """
        def __get__(self):
            return self.c.ent_id

@ -509,7 +526,9 @@ cdef class Token:
            self.c.ent_id = key

    property ent_id_:
-        '''A (string) entity ID. Usually assigned by patterns in the Matcher.'''
+        """
+        A (string) entity ID. Usually assigned by patterns in the Matcher.
+        """
        def __get__(self):
            return self.vocab.strings[self.c.ent_id]

--- a/spacy/train.py
+++ b/spacy/train.py
@ -1,15 +1,16 @@
-from __future__ import absolute_import
-from __future__ import unicode_literals
+# coding: utf8
+from __future__ import absolute_import, unicode_literals

 import random
 import tqdm
-from .gold import GoldParse
+from .gold import GoldParse, merge_sents
 from .scorer import Scorer
-from .gold import merge_sents


 class Trainer(object):
-    '''Manage training of an NLP pipeline.'''
+    """
+    Manage training of an NLP pipeline.
+    """
    def __init__(self, nlp, gold_tuples):
        self.nlp = nlp
        self.gold_tuples = gold_tuples
--- a/spacy/util.py
+++ b/spacy/util.py
@ -1,29 +1,18 @@
 # coding: utf8
 from __future__ import unicode_literals, print_function
-import os
+
 import io
-import json
+import ujson
 import re
-import os.path
-import pathlib
+from pathlib import Path
 import sys
 import textwrap

-
-try:
-    basestring
-except NameError:
-    basestring = str
-
-
-try:
-    raw_input
-except NameError: # Python 3
-    raw_input = input
+from .compat import basestring_, unicode_, input_


 LANGUAGES = {}
-_data_path = pathlib.Path(__file__).parent / 'data'
+_data_path = Path(__file__).parent / 'data'


 def set_lang_class(name, cls):
@ -47,9 +36,14 @@ def get_data_path(require_exists=True):

 def set_data_path(path):
    global _data_path
-    if isinstance(path, basestring):
-        path = pathlib.Path(path)
-    _data_path = path
+    _data_path = ensure_path(path)
+
+
+def ensure_path(path):
+    if isinstance(path, basestring_):
+        return Path(path)
+    else:
+        return path


 def or_(val1, val2):
@ -61,41 +55,8 @@ def or_(val1, val2):
        return val2


-def match_best_version(target_name, target_version, path):
-    path = path if not isinstance(path, basestring) else pathlib.Path(path)
-    if path is None or not path.exists():
-        return None
-    matches = []
-    for data_name in path.iterdir():
-        name, version = split_data_name(data_name.parts[-1])
-        if name == target_name and constraint_match(target_version, version):
-            matches.append((tuple(float(v) for v in version.split('.')), data_name))
-    if matches:
-        return pathlib.Path(max(matches)[1])
-    else:
-        return None
-
-
-def split_data_name(name):
-    return name.split('-', 1) if '-' in name else (name, '')
-
-
-def constraint_match(constraint_string, version):
-    # From http://github.com/spacy-io/sputnik
-    if not constraint_string:
-        return True
-
-    constraints = [c.strip() for c in constraint_string.split(',') if c.strip()]
-
-    for c in constraints:
-        if not re.match(r'[><=][=]?\d+(\.\d+)*', c):
-            raise ValueError('invalid constraint: %s' % c)
-
-    return all(semver.match(version, c) for c in constraints)
-
-
 def read_regex(path):
-    path = path if not isinstance(path, basestring) else pathlib.Path(path)
+    path = ensure_path(path)
    with path.open() as file_:
        entries = file_.read().split('\n')
    expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
@ -152,21 +113,11 @@ def check_renamed_kwargs(renamed, kwargs):
            raise TypeError("Keyword argument %s now renamed to %s" % (old, new))


-def is_windows():
-    """Check if user is on Windows."""
-    return sys.platform.startswith('win')
-
-
-def is_python2():
-    """Check if Python 2 is used."""
-    return sys.version.startswith('2.')
-
-
 def parse_package_meta(package_path, package, require=True):
-    location = os.path.join(str(package_path), package, 'meta.json')
-    if os.path.isfile(location):
-        with io.open(location, encoding='utf8') as f:
-            meta = json.load(f)
+    location = package_path / package / 'meta.json'
+    if location.is_file():
+        with location.open('r', encoding='utf8') as f:
+            meta = ujson.load(f)
            return meta
    elif require:
        raise IOError("Could not read meta.json from %s" % location)
@ -181,7 +132,7 @@ def get_raw_input(description, default=False):

    additional = ' (default: {d})'.format(d=default) if default else ''
    prompt = '    {d}{a}: '.format(d=description, a=additional)
-    user_input = raw_input(prompt)
+    user_input = input_(prompt)
    return user_input


@ -209,10 +160,9 @@ def print_markdown(data, **kwargs):
    which will be converted to a list of tuples."""

    def excl_value(value):
-        # don't print value if it contains absolute path of directory
-        # (i.e. personal info that shouldn't need to be shared)
-        # other conditions can be included here if necessary
-        if str(pathlib.Path(__file__).parent) in value:
+        # don't print value if it contains absolute path of directory (i.e.
+        # personal info). Other conditions can be included here if necessary.
+        if unicode_(Path(__file__).parent) in value:
            return True

    if type(data) == dict:
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -1,41 +1,29 @@
+# coding: utf8
 from __future__ import unicode_literals

+import bz2
+import ujson
+import re
+
 from libc.string cimport memset
 from libc.stdint cimport int32_t
 from libc.math cimport sqrt
-
-from pathlib import Path
-import bz2
-import ujson as json
-import re
-
-try:
-    import cPickle as pickle
-except ImportError:
-    import pickle
-
+from cymem.cymem cimport Address
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport Lexeme
 from .strings cimport hash_string
 from .typedefs cimport attr_t
 from .cfile cimport CFile, StringCFile
-from .lemmatizer import Lemmatizer
-from .attrs import intify_attrs
 from .tokens.token cimport Token
-
-from . import attrs
-from . import symbols
-
-from cymem.cymem cimport Address
 from .serialize.packer cimport Packer
 from .attrs cimport PROB, LANG
+
+from .compat import copy_reg, pickle
+from .lemmatizer import Lemmatizer
+from .attrs import intify_attrs
 from . import util
-
-
-try:
-    import copy_reg
-except ImportError:
-    import copyreg as copy_reg
+from . import attrs
+from . import symbols


 DEF MAX_VEC_SIZE = 100000
@ -48,8 +36,9 @@ EMPTY_LEXEME.vector = EMPTY_VEC


 cdef class Vocab:
-    '''A map container for a language's LexemeC structs.
-    '''
+    """
+    A map container for a language's LexemeC structs.
+    """
    @classmethod
    def load(cls, path, lex_attr_getters=None, lemmatizer=True,
             tag_map=True, serializer_freqs=True, oov_prob=True, **deprecated_kwargs):
@ -72,8 +61,7 @@ cdef class Vocab:
        Returns:
            Vocab: The newly constructed vocab object.
        """
-        if isinstance(path, basestring):
-            path = Path(path)
+        path = util.ensure_path(path)
        util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
        if 'vectors' in deprecated_kwargs:
            raise AttributeError(
@ -81,7 +69,7 @@ cdef class Vocab:
                "Install vectors after loading.")
        if tag_map is True and (path / 'vocab' / 'tag_map.json').exists():
            with (path / 'vocab' / 'tag_map.json').open('r', encoding='utf8') as file_:
-                tag_map = json.load(file_)
+                tag_map = ujson.load(file_)
        elif tag_map is True:
            tag_map = None
        if lex_attr_getters is not None \
@ -94,12 +82,12 @@ cdef class Vocab:
            lemmatizer = Lemmatizer.load(path)
        if serializer_freqs is True and (path / 'vocab' / 'serializer.json').exists():
            with (path / 'vocab' / 'serializer.json').open('r', encoding='utf8') as file_:
-                serializer_freqs = json.load(file_)
+                serializer_freqs = ujson.load(file_)
        else:
            serializer_freqs = None

        with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
-            strings_list = json.load(file_)
+            strings_list = ujson.load(file_)
        cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
                              lemmatizer=lemmatizer, serializer_freqs=serializer_freqs,
                              strings=strings_list)
@ -108,7 +96,8 @@ cdef class Vocab:

    def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
            serializer_freqs=None, strings=tuple(), **deprecated_kwargs):
-        '''Create the vocabulary.
+        """
+        Create the vocabulary.

        lex_attr_getters (dict):
            A dictionary mapping attribute IDs to functions to compute them.
@ -123,7 +112,7 @@ cdef class Vocab:

        Returns:
            Vocab: The newly constructed vocab object.
-        '''
+        """
        util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)

        lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
@ -172,17 +161,19 @@ cdef class Vocab:
            return langfunc('_') if langfunc else ''

    def __len__(self):
-        """The current number of lexemes stored."""
+        """
+        The current number of lexemes stored.
+        """
        return self.length

    def resize_vectors(self, int new_size):
-        '''
+        """
        Set vectors_length to a new size, and allocate more memory for the Lexeme
        vectors if necessary. The memory will be zeroed.

        Arguments:
            new_size (int): The new size of the vectors.
-        '''
+        """
        cdef hash_t key
        cdef size_t addr
        if new_size > self.vectors_length:
@ -193,7 +184,8 @@ cdef class Vocab:
        self.vectors_length = new_size

    def add_flag(self, flag_getter, int flag_id=-1):
-        '''Set a new boolean flag to words in the vocabulary.
+        """
+        Set a new boolean flag to words in the vocabulary.

        The flag_getter function will be called over the words currently in the
        vocab, and then applied to new words as they occur. You'll then be able
@ -213,7 +205,7 @@ cdef class Vocab:

        Returns:
            flag_id (int): The integer ID by which the flag value can be checked.
-        '''
+        """
        if flag_id == -1:
            for bit in range(1, 64):
                if bit not in self.lex_attr_getters:
@ -234,9 +226,11 @@ cdef class Vocab:
        return flag_id

    cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
-        '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
+        """
+        Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
        if necessary, using memory acquired from the given pool.  If the pool
-        is the lexicon's own memory, the lexeme is saved in the lexicon.'''
+        is the lexicon's own memory, the lexeme is saved in the lexicon.
+        """
        if string == u'':
            return &EMPTY_LEXEME
        cdef LexemeC* lex
@ -252,9 +246,11 @@ cdef class Vocab:
            return self._new_lexeme(mem, string)

    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
-        '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
+        """
+        Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
        if necessary, using memory acquired from the given pool.  If the pool
-        is the lexicon's own memory, the lexeme is saved in the lexicon.'''
+        is the lexicon's own memory, the lexeme is saved in the lexicon.
+        """
        if orth == 0:
            return &EMPTY_LEXEME
        cdef LexemeC* lex
@ -297,30 +293,33 @@ cdef class Vocab:
        self.length += 1

    def __contains__(self, unicode string):
-        '''Check whether the string has an entry in the vocabulary.
+        """
+        Check whether the string has an entry in the vocabulary.

        Arguments:
            string (unicode): The ID string.

        Returns:
            bool Whether the string has an entry in the vocabulary.
-        '''
+        """
        key = hash_string(string)
        lex = self._by_hash.get(key)
        return lex is not NULL

    def __iter__(self):
-        '''Iterate over the lexemes in the vocabulary.
+        """
+        Iterate over the lexemes in the vocabulary.

        Yields: Lexeme An entry in the vocabulary.
-        '''
+        """
        cdef attr_t orth
        cdef size_t addr
        for orth, addr in self._by_orth.items():
            yield Lexeme(self, orth)

    def __getitem__(self,  id_or_string):
-        '''Retrieve a lexeme, given an int ID or a unicode string.  If a previously
+        """
+        Retrieve a lexeme, given an int ID or a unicode string.  If a previously
        unseen unicode string is given, a new lexeme is created and stored.

        Arguments:
@ -332,7 +331,7 @@ cdef class Vocab:

        Returns:
            lexeme (Lexeme): The lexeme indicated by the given ID.
-        '''
+        """
        cdef attr_t orth
        if type(id_or_string) == unicode:
            orth = self.strings[id_or_string]
@ -355,7 +354,8 @@ cdef class Vocab:
        return tokens

    def dump(self, loc=None):
-        """Save the lexemes binary data to the given location, or
+        """
+        Save the lexemes binary data to the given location, or
        return a byte-string with the data if loc is None.

        Arguments:
@ -392,14 +392,15 @@ cdef class Vocab:
            return fp.string_data()

    def load_lexemes(self, loc):
-        '''Load the binary vocabulary data from the given location.
+        """
+        Load the binary vocabulary data from the given location.

        Arguments:
            loc (Path): The path to load from.

        Returns:
            None
-        '''
+        """
        fp = CFile(loc, 'rb',
                on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc))
        cdef LexemeC* lexeme = NULL
@ -440,8 +441,9 @@ cdef class Vocab:
        fp.close()

    def _deserialize_lexemes(self, CFile fp):
-        '''Load the binary vocabulary data from the given CFile.
-        '''
+        """
+        Load the binary vocabulary data from the given CFile.
+        """
        cdef LexemeC* lexeme = NULL
        cdef hash_t key
        cdef unicode py_str
@ -494,13 +496,14 @@ cdef class Vocab:
        fp.close()

    def dump_vectors(self, out_loc):
-        '''Save the word vectors to a binary file.
+        """
+        Save the word vectors to a binary file.

        Arguments:
            out_loc (Path): The path to save to.
        Returns:
            None
-        '''
+        """
        cdef int32_t vec_len = self.vectors_length
        cdef int32_t word_len
        cdef bytes word_str
@ -522,7 +525,8 @@ cdef class Vocab:
        out_file.close()

    def load_vectors(self, file_):
-        """Load vectors from a text-based file.
+        """
+        Load vectors from a text-based file.

        Arguments:
            file_ (buffer): The file to read from. Entries should be separated by newlines,
@ -561,7 +565,8 @@ cdef class Vocab:
        return vec_len

    def load_vectors_from_bin_loc(self, loc):
-        """Load vectors from the location of a binary file.
+        """
+        Load vectors from the location of a binary file.

        Arguments:
            loc (unicode): The path of the binary file to load from.