Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop"

This reverts commit c9ba3d3c2d, reversing changes made to 92c26a35d4.
2026-03-06 21:01:34 +03:00 · 2018-03-27 19:23:02 +02:00 · 2018-03-27 19:23:02 +02:00 · 1f7229f40f
commit 1f7229f40f
parent 8b7a74570f
67 changed files with 4799 additions and 1040 deletions
--- a/.appveyor.yml
+++ b/.appveyor.yml
@ -32,7 +32,7 @@ test_script:
  # Note that you must use the environment variable %PYTHON% to refer to
  # the interpreter you're using - Appveyor does not do anything special
  # to put the Python version you want to use on PATH.
-  - "%PYTHON%\\python.exe -m pytest spacy/"
+  - "%PYTHON%\\python.exe -m pytest spacy/ --no-print-logs"

 after_test:
  # This step builds your wheels.
--- a/.buildkite/train.yml
+++ b/.buildkite/train.yml
@ -0,0 +1,11 @@
+steps:
+  -
+    command: "fab env clean make test wheel"
+    label: ":dizzy: :python:"
+    artifact_paths: "dist/*.whl"
+  - wait
+  - trigger: "spacy-train-from-wheel"
+    label: ":dizzy: :train:"
+    build:
+      env:
+        SPACY_VERSION: "{$SPACY_VERSION}"
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -182,7 +182,7 @@ If you've made a contribution to spaCy, you should fill in the
 [spaCy contributor agreement](.github/CONTRIBUTOR_AGREEMENT.md) to ensure that
 your contribution can be used across the project. If you agree to be bound by
 the terms of the agreement, fill in the [template](.github/CONTRIBUTOR_AGREEMENT.md)
-and include it with your pull request, or sumit it separately to
+and include it with your pull request, or submit it separately to
 [`.github/contributors/`](/.github/contributors). The name of the file should be
 your GitHub username, with the extension `.md`. For example, the user
 example_user would create the file `.github/contributors/example_user.md`.
--- a/examples/training/conllu.py
+++ b/examples/training/conllu.py
@ -0,0 +1,392 @@
+'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
+.conllu format for development data, allowing the official scorer to be used.
+'''
+from __future__ import unicode_literals
+import plac
+import tqdm
+import attr
+from pathlib import Path
+import re
+import sys
+import json
+
+import spacy
+import spacy.util
+from spacy.tokens import Token, Doc
+from spacy.gold import GoldParse
+from spacy.syntax.nonproj import projectivize
+from collections import defaultdict, Counter
+from timeit import default_timer as timer
+from spacy.matcher import Matcher
+
+import itertools
+import random
+import numpy.random
+import cytoolz
+
+import conll17_ud_eval
+
+import spacy.lang.zh
+import spacy.lang.ja
+
+spacy.lang.zh.Chinese.Defaults.use_jieba = False
+spacy.lang.ja.Japanese.Defaults.use_janome = False
+
+random.seed(0)
+numpy.random.seed(0)
+
+def minibatch_by_words(items, size=5000):
+    random.shuffle(items)
+    if isinstance(size, int):
+        size_ = itertools.repeat(size)
+    else:
+        size_ = size
+    items = iter(items)
+    while True:
+        batch_size = next(size_)
+        batch = []
+        while batch_size >= 0:
+            try:
+                doc, gold = next(items)
+            except StopIteration:
+                if batch:
+                    yield batch
+                return
+            batch_size -= len(doc)
+            batch.append((doc, gold))
+        if batch:
+            yield batch
+        else:
+            break
+
+################
+# Data reading #
+################
+
+space_re = re.compile('\s+')
+def split_text(text):
+    return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')]
+ 
+
+def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
+              max_doc_length=None, limit=None):
+    '''Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
+    include Doc objects created using nlp.make_doc and then aligned against
+    the gold-standard sequences. If oracle_segments=True, include Doc objects
+    created from the gold-standard segments. At least one must be True.'''
+    if not raw_text and not oracle_segments:
+        raise ValueError("At least one of raw_text or oracle_segments must be True")
+    paragraphs = split_text(text_file.read())
+    conllu = read_conllu(conllu_file)
+    # sd is spacy doc; cd is conllu doc
+    # cs is conllu sent, ct is conllu token
+    docs = []
+    golds = []
+    for doc_id, (text, cd) in enumerate(zip(paragraphs, conllu)):
+        sent_annots = []
+        for cs in cd:
+            sent = defaultdict(list)
+            for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
+                if '.' in id_:
+                    continue
+                if '-' in id_:
+                    continue
+                id_ = int(id_)-1
+                head = int(head)-1 if head != '0' else id_
+                sent['words'].append(word)
+                sent['tags'].append(tag)
+                sent['heads'].append(head)
+                sent['deps'].append('ROOT' if dep == 'root' else dep)
+                sent['spaces'].append(space_after == '_')
+            sent['entities'] = ['-'] * len(sent['words'])
+            sent['heads'], sent['deps'] = projectivize(sent['heads'],
+                                                       sent['deps'])
+            if oracle_segments:
+                docs.append(Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces']))
+                golds.append(GoldParse(docs[-1], **sent))
+
+            sent_annots.append(sent)
+            if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
+                doc, gold = _make_gold(nlp, None, sent_annots)
+                sent_annots = []
+                docs.append(doc)
+                golds.append(gold)
+                if limit and len(docs) >= limit:
+                    return docs, golds
+
+        if raw_text and sent_annots:
+            doc, gold = _make_gold(nlp, None, sent_annots)
+            docs.append(doc)
+            golds.append(gold)
+        if limit and len(docs) >= limit:
+            return docs, golds
+    return docs, golds
+
+
+def read_conllu(file_):
+    docs = []
+    sent = []
+    doc = []
+    for line in file_:
+        if line.startswith('# newdoc'):
+            if doc:
+                docs.append(doc)
+            doc = []
+        elif line.startswith('#'):
+            continue
+        elif not line.strip():
+            if sent:
+                doc.append(sent)
+            sent = []
+        else:
+            sent.append(list(line.strip().split('\t')))
+            if len(sent[-1]) != 10:
+                print(repr(line))
+                raise ValueError
+    if sent:
+        doc.append(sent)
+    if doc:
+        docs.append(doc)
+    return docs
+
+
+def _make_gold(nlp, text, sent_annots):
+    # Flatten the conll annotations, and adjust the head indices
+    flat = defaultdict(list)
+    for sent in sent_annots:
+        flat['heads'].extend(len(flat['words'])+head for head in sent['heads'])
+        for field in ['words', 'tags', 'deps', 'entities', 'spaces']:
+            flat[field].extend(sent[field])
+    # Construct text if necessary
+    assert len(flat['words']) == len(flat['spaces'])
+    if text is None:
+        text = ''.join(word+' '*space for word, space in zip(flat['words'], flat['spaces'])) 
+    doc = nlp.make_doc(text)
+    flat.pop('spaces')
+    gold = GoldParse(doc, **flat)
+    return doc, gold
+
+#############################
+# Data transforms for spaCy #
+#############################
+
+def golds_to_gold_tuples(docs, golds):
+    '''Get out the annoying 'tuples' format used by begin_training, given the
+    GoldParse objects.'''
+    tuples = []
+    for doc, gold in zip(docs, golds):
+        text = doc.text
+        ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
+        sents = [((ids, words, tags, heads, labels, iob), [])]
+        tuples.append((text, sents))
+    return tuples
+
+
+##############
+# Evaluation #
+##############
+
+def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
+    with text_loc.open('r', encoding='utf8') as text_file:
+        texts = split_text(text_file.read())
+        docs = list(nlp.pipe(texts))
+    with sys_loc.open('w', encoding='utf8') as out_file:
+        write_conllu(docs, out_file)
+    with gold_loc.open('r', encoding='utf8') as gold_file:
+        gold_ud = conll17_ud_eval.load_conllu(gold_file)
+        with sys_loc.open('r', encoding='utf8') as sys_file:
+            sys_ud = conll17_ud_eval.load_conllu(sys_file)
+        scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
+    return scores
+
+
+def write_conllu(docs, file_):
+    merger = Matcher(docs[0].vocab)
+    merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}])
+    for i, doc in enumerate(docs):
+        matches = merger(doc)
+        spans = [doc[start:end+1] for _, start, end in matches]
+        offsets = [(span.start_char, span.end_char) for span in spans]
+        for start_char, end_char in offsets:
+            doc.merge(start_char, end_char)
+        file_.write("# newdoc id = {i}\n".format(i=i))
+        for j, sent in enumerate(doc.sents):
+            file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
+            file_.write("# text = {text}\n".format(text=sent.text))
+            for k, token in enumerate(sent):
+                file_.write(token._.get_conllu_lines(k) + '\n')
+            file_.write('\n')
+
+
+def print_progress(itn, losses, ud_scores):
+    fields = {
+        'dep_loss': losses.get('parser', 0.0),
+        'tag_loss': losses.get('tagger', 0.0),
+        'words': ud_scores['Words'].f1 * 100,
+        'sents': ud_scores['Sentences'].f1 * 100,
+        'tags': ud_scores['XPOS'].f1 * 100,
+        'uas': ud_scores['UAS'].f1 * 100,
+        'las': ud_scores['LAS'].f1 * 100,
+    }
+    header = ['Epoch', 'Loss', 'LAS', 'UAS', 'TAG', 'SENT', 'WORD']
+    if itn == 0:
+        print('\t'.join(header))
+    tpl = '\t'.join((
+        '{:d}',
+        '{dep_loss:.1f}',
+        '{las:.1f}',
+        '{uas:.1f}',
+        '{tags:.1f}',
+        '{sents:.1f}',
+        '{words:.1f}',
+    ))
+    print(tpl.format(itn, **fields))
+
+#def get_sent_conllu(sent, sent_id):
+#    lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
+
+def get_token_conllu(token, i):
+    if token._.begins_fused:
+        n = 1
+        while token.nbor(n)._.inside_fused:
+            n += 1
+        id_ = '%d-%d' % (i, i+n)
+        lines = [id_, token.text, '_', '_', '_', '_', '_', '_', '_', '_']
+    else:
+        lines = []
+    if token.head.i == token.i:
+        head = 0
+    else:
+        head = i + (token.head.i - token.i) + 1
+    fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, '_',
+              str(head), token.dep_.lower(), '_', '_']
+    lines.append('\t'.join(fields))
+    return '\n'.join(lines)
+
+Token.set_extension('get_conllu_lines', method=get_token_conllu)
+Token.set_extension('begins_fused', default=False)
+Token.set_extension('inside_fused', default=False)
+
+
+##################
+# Initialization #
+##################
+
+
+def load_nlp(corpus, config):
+    lang = corpus.split('_')[0]
+    nlp = spacy.blank(lang)
+    if config.vectors:
+        nlp.vocab.from_disk(config.vectors / 'vocab')
+    return nlp
+
+def initialize_pipeline(nlp, docs, golds, config):
+    nlp.add_pipe(nlp.create_pipe('parser'))
+    if config.multitask_tag:
+        nlp.parser.add_multitask_objective('tag')
+    if config.multitask_sent:
+        nlp.parser.add_multitask_objective('sent_start')
+    nlp.parser.moves.add_action(2, 'subtok')
+    nlp.add_pipe(nlp.create_pipe('tagger'))
+    for gold in golds:
+        for tag in gold.tags:
+            if tag is not None:
+                nlp.tagger.add_label(tag)
+    # Replace labels that didn't make the frequency cutoff
+    actions = set(nlp.parser.labels)
+    label_set = set([act.split('-')[1] for act in actions if '-' in act])
+    for gold in golds:
+        for i, label in enumerate(gold.labels):
+            if label is not None and label not in label_set:
+                gold.labels[i] = label.split('||')[0]
+    return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))
+
+
+########################
+# Command line helpers #
+########################
+
+@attr.s
+class Config(object):
+    vectors = attr.ib(default=None)
+    max_doc_length = attr.ib(default=10)
+    multitask_tag = attr.ib(default=True)
+    multitask_sent = attr.ib(default=True)
+    nr_epoch = attr.ib(default=30)
+    batch_size = attr.ib(default=1000)
+    dropout = attr.ib(default=0.2)
+
+    @classmethod
+    def load(cls, loc):
+        with Path(loc).open('r', encoding='utf8') as file_:
+            cfg = json.load(file_)
+        return cls(**cfg)
+
+
+class Dataset(object):
+    def __init__(self, path, section):
+        self.path = path
+        self.section = section
+        self.conllu = None
+        self.text = None
+        for file_path in self.path.iterdir():
+            name = file_path.parts[-1]
+            if section in name and name.endswith('conllu'):
+                self.conllu = file_path
+            elif section in name and name.endswith('txt'):
+                self.text = file_path
+        if self.conllu is None:
+            msg = "Could not find .txt file in {path} for {section}"
+            raise IOError(msg.format(section=section, path=path))
+        if self.text is None:
+            msg = "Could not find .txt file in {path} for {section}"
+        self.lang = self.conllu.parts[-1].split('-')[0].split('_')[0]
+
+
+class TreebankPaths(object):
+    def __init__(self, ud_path, treebank, **cfg):
+        self.train = Dataset(ud_path / treebank, 'train')
+        self.dev = Dataset(ud_path / treebank, 'dev')
+        self.lang = self.train.lang
+
+
+@plac.annotations(
+    ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
+    corpus=("UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
+            "positional", None, str),
+    parses_dir=("Directory to write the development parses", "positional", None, Path),
+    config=("Path to json formatted config file", "positional", None, Config.load),
+    limit=("Size limit", "option", "n", int)
+)
+def main(ud_dir, parses_dir, config, corpus, limit=0):
+    paths = TreebankPaths(ud_dir, corpus)
+    if not (parses_dir / corpus).exists():
+        (parses_dir / corpus).mkdir()
+    print("Train and evaluate", corpus, "using lang", paths.lang)
+    nlp = load_nlp(paths.lang, config)
+
+    docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
+                            max_doc_length=config.max_doc_length, limit=limit)
+
+    optimizer = initialize_pipeline(nlp, docs, golds, config)
+
+    for i in range(config.nr_epoch):
+        docs = [nlp.make_doc(doc.text) for doc in docs]
+        batches = minibatch_by_words(list(zip(docs, golds)), size=config.batch_size)
+        losses = {}
+        n_train_words = sum(len(doc) for doc in docs)
+        with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
+            for batch in batches:
+                batch_docs, batch_gold = zip(*batch)
+                pbar.update(sum(len(doc) for doc in batch_docs))
+                nlp.update(batch_docs, batch_gold, sgd=optimizer,
+                           drop=config.dropout, losses=losses)
+        
+        out_path = parses_dir / corpus / 'epoch-{i}.conllu'.format(i=i)
+        with nlp.use_params(optimizer.averages):
+            scores = evaluate(nlp, paths.dev.text, paths.dev.conllu, out_path)
+            print_progress(i, losses, scores)
+
+
+if __name__ == '__main__':
+    plac.call(main)
--- a/examples/vectors_tensorboard_standalone.py
+++ b/examples/vectors_tensorboard_standalone.py
@ -1,88 +0,0 @@
-#!/usr/bin/env python
-# coding: utf8
-"""Export spaCy model vectors for use in TensorBoard's standalone embedding projector.
-https://github.com/tensorflow/embedding-projector-standalone
-
-Usage:
-
- python vectors_tensorboard_standalone.py ./myVectorModel ./output [name]
-
-This outputs two files that have to be copied into the "oss_data" of the standalone projector:
-
- [name]_labels.tsv - metadata such as human readable labels for vectors
- [name]_tensors.bytes - numpy.ndarray of numpy.float32 precision vectors
-
-"""
-from __future__ import unicode_literals
-
-import json
-import math
-from os import path
-
-import numpy
-import plac
-import spacy
-import tqdm
-
-
-@plac.annotations(
-    vectors_loc=("Path to spaCy model that contains vectors", "positional", None, str),
-    out_loc=("Path to output folder writing tensors and labels data", "positional", None, str),
-    name=("Human readable name for tsv file and vectors tensor", "positional", None, str),
-)
-def main(vectors_loc, out_loc, name="spaCy_vectors"):
-    # A tab-separated file that contains information about the vectors for visualization
-    #
-    # Learn more: https://www.tensorflow.org/programmers_guide/embedding#metadata
-    meta_file = "{}_labels.tsv".format(name)
-    out_meta_file = path.join(out_loc, meta_file)
-
-    print('Loading spaCy vectors model: {}'.format(vectors_loc))
-    model = spacy.load(vectors_loc)
-
-    print('Finding lexemes with vectors attached: {}'.format(vectors_loc))
-    voacb_strings = [
-        w for w in tqdm.tqdm(model.vocab.strings, total=len(model.vocab.strings), leave=False)
-        if model.vocab.has_vector(w)
-    ]
-    vector_count = len(voacb_strings)
-
-    print('Building Projector labels for {} vectors: {}'.format(vector_count, out_meta_file))
-    vector_dimensions = model.vocab.vectors.shape[1]
-    tf_vectors_variable = numpy.zeros((vector_count, vector_dimensions), dtype=numpy.float32)
-
-    # Write a tab-separated file that contains information about the vectors for visualization
-    #
-    # Reference: https://www.tensorflow.org/programmers_guide/embedding#metadata
-    with open(out_meta_file, 'wb') as file_metadata:
-        # Define columns in the first row
-        file_metadata.write("Text\tFrequency\n".encode('utf-8'))
-        # Write out a row for each vector that we add to the tensorflow variable we created
-        vec_index = 0
-
-        for text in tqdm.tqdm(voacb_strings, total=len(voacb_strings), leave=False):
-            # https://github.com/tensorflow/tensorflow/issues/9094
-            text = '<Space>' if text.lstrip() == '' else text
-            lex = model.vocab[text]
-
-            # Store vector data and metadata
-            tf_vectors_variable[vec_index] = numpy.float64(model.vocab.get_vector(text))
-            file_metadata.write("{}\t{}\n".format(text, math.exp(lex.prob) * len(voacb_strings)).encode('utf-8'))
-            vec_index += 1
-
-    # Write out "[name]_tensors.bytes" file for standalone embeddings projector to load
-    tensor_path = '{}_tensors.bytes'.format(name)
-    tf_vectors_variable.tofile(path.join(out_loc, tensor_path))
-
-    print('Done.')
-    print('Add the following entry to "oss_data/oss_demo_projector_config.json"')
-    print(json.dumps({
-        "tensorName": name,
-        "tensorShape": [vector_count, vector_dimensions],
-        "tensorPath": 'oss_data/{}'.format(tensor_path),
-        "metadataPath": 'oss_data/{}'.format(meta_file)
-    }, indent=2))
-
-
-if __name__ == '__main__':
-    plac.call(main)
--- a/fabfile.py
+++ b/fabfile.py
@ -1,49 +1,92 @@
 # coding: utf-8
 from __future__ import unicode_literals, print_function

+import contextlib
+from pathlib import Path
 from fabric.api import local, lcd, env, settings, prefix
-from fabtools.python import virtualenv
 from os import path, environ
+import shutil


 PWD = path.dirname(__file__)
 ENV = environ['VENV_DIR'] if 'VENV_DIR' in environ else '.env'
-VENV_DIR = path.join(PWD, ENV)
+VENV_DIR = Path(PWD) / ENV


-def env(lang='python2.7'):
-    if path.exists(VENV_DIR):
+@contextlib.contextmanager
+def virtualenv(name, create=False, python='/usr/bin/python3.6'):
+    python = Path(python).resolve()
+    env_path = VENV_DIR
+    if create:
+        if env_path.exists():
+            shutil.rmtree(str(env_path))
+        local('{python} -m venv {env_path}'.format(python=python, env_path=VENV_DIR))
+    def wrapped_local(cmd, env_vars=[], capture=False, direct=False):
+        return local('source {}/bin/activate && {}'.format(env_path, cmd),
+                     shell='/bin/bash', capture=False)
+    yield wrapped_local
+
+
+def env(lang='python3.6'):
+    if VENV_DIR.exists():
        local('rm -rf {env}'.format(env=VENV_DIR))
-    local('pip install virtualenv')
-    local('python -m virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR))
+    if lang.startswith('python3'):
+        local('{lang} -m venv {env}'.format(lang=lang, env=VENV_DIR))
+    else:
+        local('{lang} -m pip install virtualenv --no-cache-dir'.format(lang=lang))
+        local('{lang} -m virtualenv {env} --no-cache-dir'.format(lang=lang, env=VENV_DIR))
+    with virtualenv(VENV_DIR) as venv_local:
+        print(venv_local('python --version', capture=True))
+        venv_local('pip install --upgrade setuptools --no-cache-dir')
+        venv_local('pip install pytest --no-cache-dir')
+        venv_local('pip install wheel --no-cache-dir')
+        venv_local('pip install -r requirements.txt --no-cache-dir')
+        venv_local('pip install pex --no-cache-dir')
+


 def install():
-    with virtualenv(VENV_DIR):
-        local('pip install --upgrade setuptools')
-        local('pip install dist/*.tar.gz')
-        local('pip install pytest')
+    with virtualenv(VENV_DIR) as venv_local:
+        venv_local('pip install dist/*.tar.gz')


 def make():
-    with virtualenv(VENV_DIR):
-        with lcd(path.dirname(__file__)):
-            local('pip install cython')
-            local('pip install murmurhash')
-            local('pip install -r requirements.txt')
-            local('python setup.py build_ext --inplace')
+    with lcd(path.dirname(__file__)):
+        local('export PYTHONPATH=`pwd` && source .env/bin/activate && python setup.py build_ext --inplace',
+            shell='/bin/bash')

 def sdist():
-    with virtualenv(VENV_DIR):
+    with virtualenv(VENV_DIR) as venv_local:
        with lcd(path.dirname(__file__)):
            local('python setup.py sdist')

+def wheel():
+    with virtualenv(VENV_DIR) as venv_local:
+        with lcd(path.dirname(__file__)):
+            venv_local('python setup.py bdist_wheel')
+
+def pex():
+    with virtualenv(VENV_DIR) as venv_local:
+        with lcd(path.dirname(__file__)):
+            sha = local('git rev-parse --short HEAD', capture=True)
+            venv_local('pex dist/*.whl -e spacy -o dist/spacy-%s.pex' % sha,
+                direct=True)
+
+
 def clean():
    with lcd(path.dirname(__file__)):
-        local('python setup.py clean --all')
+        local('rm -f dist/*.whl')
+        local('rm -f dist/*.pex')
+        with virtualenv(VENV_DIR) as venv_local:
+            venv_local('python setup.py clean --all')


 def test():
-    with virtualenv(VENV_DIR):
+    with virtualenv(VENV_DIR) as venv_local:
        with lcd(path.dirname(__file__)):
-            local('py.test -x spacy/tests')
+            venv_local('pytest -x spacy/tests')
+
+def train():
+    args = environ.get('SPACY_TRAIN_ARGS', '')
+    with virtualenv(VENV_DIR) as venv_local:
+        venv_local('spacy train {args}'.format(args=args))
--- a/requirements.txt
+++ b/requirements.txt
@ -5,8 +5,8 @@ cymem>=1.30,<1.32
 preshed>=1.0.0,<2.0.0
 thinc>=6.11.1.dev10,<6.12.0
 murmurhash>=0.28,<0.29
+cytoolz>=0.9.0,<0.10.0
 plac<1.0.0,>=0.9.6
-six
 ujson>=1.35
 dill>=0.2,<0.3
 requests>=2.13.0,<3.0.0
@ -16,4 +16,3 @@ pytest>=3.0.6,<4.0.0
 mock>=2.0.0,<3.0.0
 msgpack-python==0.5.4
 msgpack-numpy==0.4.1
-html5lib==1.0b8
--- a/setup.py
+++ b/setup.py
@ -18,6 +18,7 @@ PACKAGES = find_packages()


 MOD_NAMES = [
+    'spacy._align',
    'spacy.parts_of_speech',
    'spacy.strings',
    'spacy.lexeme',
@ -191,8 +192,6 @@ def setup_package():
                'preshed>=1.0.0,<2.0.0',
                'thinc>=6.11.1.dev10,<6.12.0',
                'plac<1.0.0,>=0.9.6',
-                'six',
-                'html5lib==1.0b8',
                'pathlib',
                'ujson>=1.35',
                'dill>=0.2,<0.3',
@ -201,6 +200,7 @@ def setup_package():
                'ftfy>=4.4.2,<5.0.0',
                'msgpack-python==0.5.4',
                'msgpack-numpy==0.4.1'],
+            setup_requires=['wheel'],
            classifiers=[
                'Development Status :: 5 - Production/Stable',
                'Environment :: Console',
--- a/spacy/main.py
+++ b/spacy/main.py
@ -8,6 +8,7 @@ if __name__ == '__main__':
    import sys
    from spacy.cli import download, link, info, package, train, convert
    from spacy.cli import vocab, init_model, profile, evaluate, validate
+    from spacy.cli import ud_train, ud_evaluate
    from spacy.util import prints

    commands = {
@ -15,7 +16,9 @@ if __name__ == '__main__':
        'link': link,
        'info': info,
        'train': train,
+        'ud-train': ud_train,
        'evaluate': evaluate,
+        'ud-evaluate': ud_evaluate,
        'convert': convert,
        'package': package,
        'vocab': vocab,
--- a/spacy/_align.pyx
+++ b/spacy/_align.pyx
@ -0,0 +1,251 @@
+# cython: infer_types=True
+'''Do Levenshtein alignment, for evaluation of tokenized input.
+
+Random notes:
+
+  r i n g
+  0 1 2 3 4
+r 1 0 1 2 3
+a 2 1 1 2 3
+n 3 2 2 1 2
+g 4 3 3 2 1
+
+0,0: (1,1)=min(0+0,1+1,1+1)=0 S
+1,0: (2,1)=min(1+1,0+1,2+1)=1 D
+2,0: (3,1)=min(2+1,3+1,1+1)=2 D
+3,0: (4,1)=min(3+1,4+1,2+1)=3 D
+0,1: (1,2)=min(1+1,2+1,0+1)=1 D
+1,1: (2,2)=min(0+1,1+1,1+1)=1 S
+2,1: (3,2)=min(1+1,1+1,2+1)=2 S or I
+3,1: (4,2)=min(2+1,2+1,3+1)=3 S or I
+0,2: (1,3)=min(2+1,3+1,1+1)=2 I
+1,2: (2,3)=min(1+1,2+1,1+1)=2 S or I
+2,2: (3,3)
+3,2: (4,3)
+At state (i, j) we're asking "How do I transform S[:i+1] to T[:j+1]?"
+
+We know the costs to transition:
+
+S[:i]   -> T[:j]   (at D[i,j])
+S[:i+1] -> T[:j]   (at D[i+1,j])
+S[:i]   -> T[:j+1] (at D[i,j+1])
+    
+Further, we now we can tranform:
+S[:i+1] -> S[:i] (DEL) for 1,
+T[:j+1] -> T[:j] (INS) for 1.
+S[i+1]  -> T[j+1] (SUB) for 0 or 1
+
+Therefore we have the costs:
+SUB: Cost(S[:i]->T[:j])   + Cost(S[i]->S[j])
+i.e. D[i, j] + S[i+1] != T[j+1]
+INS: Cost(S[:i+1]->T[:j]) + Cost(T[:j+1]->T[:j])
+i.e. D[i+1,j] + 1
+DEL: Cost(S[:i]->T[:j+1]) + Cost(S[:i+1]->S[:i]) 
+i.e. D[i,j+1] + 1
+
+    Source string S has length m, with index i
+    Target string T has length n, with index j
+
+    Output two alignment vectors: i2j (length m) and j2i (length n)
+    # function LevenshteinDistance(char s[1..m], char t[1..n]):
+    # for all i and j, d[i,j] will hold the Levenshtein distance between
+    # the first i characters of s and the first j characters of t
+    # note that d has (m+1)*(n+1) values
+    # set each element in d to zero
+    ring rang
+      - r i n g
+    - 0 0 0 0 0
+    r 0 0 0 0 0
+    a 0 0 0 0 0
+    n 0 0 0 0 0
+    g 0 0 0 0 0
+
+    # source prefixes can be transformed into empty string by
+    # dropping all characters
+    # d[i, 0] := i
+    ring rang
+      - r i n g
+    - 0 0 0 0 0
+    r 1 0 0 0 0
+    a 2 0 0 0 0
+    n 3 0 0 0 0
+    g 4 0 0 0 0
+
+    # target prefixes can be reached from empty source prefix
+    # by inserting every character
+    # d[0, j] := j
+      - r i n g
+    - 0 1 2 3 4
+    r 1 0 0 0 0
+    a 2 0 0 0 0
+    n 3 0 0 0 0
+    g 4 0 0 0 0
+
+'''
+from __future__ import unicode_literals
+from libc.stdint cimport uint32_t
+import numpy
+cimport numpy as np
+from .compat import unicode_
+from murmurhash.mrmr cimport hash32
+
+
+def align(S, T):
+    cdef int m = len(S)
+    cdef int n = len(T)
+    cdef np.ndarray matrix = numpy.zeros((m+1, n+1), dtype='int32')
+    cdef np.ndarray i2j = numpy.zeros((m,), dtype='i')
+    cdef np.ndarray j2i = numpy.zeros((n,), dtype='i')
+
+    cdef np.ndarray S_arr = _convert_sequence(S)
+    cdef np.ndarray T_arr = _convert_sequence(T)
+
+    fill_matrix(<int*>matrix.data,
+        <const int*>S_arr.data, m, <const int*>T_arr.data, n)
+    fill_i2j(i2j, matrix)
+    fill_j2i(j2i, matrix)
+    for i in range(i2j.shape[0]):
+        if i2j[i] >= 0 and len(S[i]) != len(T[i2j[i]]):
+            i2j[i] = -1
+    for j in range(j2i.shape[0]):
+        if j2i[j] >= 0 and len(T[j]) != len(S[j2i[j]]):
+            j2i[j] = -1
+    return matrix[-1,-1], i2j, j2i, matrix
+
+
+def multi_align(np.ndarray i2j, np.ndarray j2i, i_lengths, j_lengths):
+    '''Let's say we had:
+
+    Guess: [aa bb cc dd]
+    Truth: [aa bbcc dd]
+    i2j: [0, None, -2, 2]
+    j2i: [0, -2, 3]
+
+    We want:
+
+    i2j_multi: {1: 1, 2: 1}
+    j2i_multi: {}
+    '''
+    i2j_miss = _get_regions(i2j, i_lengths)
+    j2i_miss = _get_regions(j2i, j_lengths)
+
+    i2j_multi, j2i_multi = _get_mapping(i2j_miss, j2i_miss, i_lengths, j_lengths)
+    return i2j_multi, j2i_multi
+
+
+def _get_regions(alignment, lengths):
+    regions = {}
+    start = None
+    offset = 0
+    for i in range(len(alignment)):
+        if alignment[i] < 0:
+            if start is None:
+                start = offset
+                regions.setdefault(start, [])
+            regions[start].append(i)
+        else:
+            start = None
+        offset += lengths[i]
+    return regions
+
+
+def _get_mapping(miss1, miss2, lengths1, lengths2):
+    i2j = {}
+    j2i = {}
+    for start, region1 in miss1.items():
+        if not region1 or start not in miss2:
+            continue
+        region2 = miss2[start]
+        if sum(lengths1[i] for i in region1) == sum(lengths2[i] for i in region2):
+            j = region2.pop(0)
+            buff = []
+            # Consume tokens from region 1, until we meet the length of the
+            # first token in region2. If we do, align the tokens. If
+            # we exceed the length, break.
+            while region1:
+                buff.append(region1.pop(0))
+                if sum(lengths1[i] for i in buff) == lengths2[j]:
+                    for i in buff:
+                        i2j[i] = j
+                    j2i[j] = buff[-1]
+                    j += 1
+                    buff = []
+                elif sum(lengths1[i] for i in buff) > lengths2[j]:
+                    break
+            else:
+                if buff and sum(lengths1[i] for i in buff) == lengths2[j]:
+                    for i in buff:
+                        i2j[i] = j
+                    j2i[j] = buff[-1]
+    return i2j, j2i
+
+
+def _convert_sequence(seq):
+    if isinstance(seq, numpy.ndarray):
+        return numpy.ascontiguousarray(seq, dtype='uint32_t')
+    cdef np.ndarray output = numpy.zeros((len(seq),), dtype='uint32')
+    cdef bytes item_bytes
+    for i, item in enumerate(seq):
+        if isinstance(item, unicode):
+            item_bytes = item.encode('utf8')
+        else:
+            item_bytes = item
+        output[i] = hash32(<void*><char*>item_bytes, len(item_bytes), 0)
+    return output
+
+
+cdef void fill_matrix(int* D, 
+        const int* S, int m, const int* T, int n) nogil:
+    m1 = m+1
+    n1 = n+1
+    for i in range(m1*n1):
+        D[i] = 0
+ 
+    for i in range(m1):
+        D[i*n1] = i
+ 
+    for j in range(n1):
+        D[j] = j
+ 
+    cdef int sub_cost, ins_cost, del_cost
+    for j in range(n):
+        for i in range(m):
+            i_j = i*n1 + j
+            i1_j1 = (i+1)*n1 + j+1
+            i1_j = (i+1)*n1 + j
+            i_j1 = i*n1 + j+1
+            if S[i] != T[j]:
+                sub_cost = D[i_j] + 1
+            else:
+                sub_cost = D[i_j]
+            del_cost = D[i_j1] + 1
+            ins_cost = D[i1_j] + 1
+            best = min(min(sub_cost, ins_cost), del_cost)
+            D[i1_j1] = best
+
+
+cdef void fill_i2j(np.ndarray i2j, np.ndarray D) except *:
+    j = D.shape[1]-2
+    cdef int i = D.shape[0]-2
+    while i >= 0:
+        while D[i+1, j] < D[i+1, j+1]:
+            j -= 1
+        if D[i, j+1] < D[i+1, j+1]:
+            i2j[i] = -1
+        else:
+            i2j[i] = j
+            j -= 1
+        i -= 1
+
+cdef void fill_j2i(np.ndarray j2i, np.ndarray D) except *:
+    i = D.shape[0]-2
+    cdef int j = D.shape[1]-2
+    while j >= 0:
+        while D[i, j+1] < D[i+1, j+1]:
+            i -= 1
+        if D[i+1, j] < D[i+1, j+1]:
+            j2i[j] = -1
+        else:
+            j2i[j] = i
+            i -= 1
+        j -= 1
--- a/spacy/_matcher2_notes.py
+++ b/spacy/_matcher2_notes.py
@ -0,0 +1,251 @@
+import pytest
+
+
+class Vocab(object):
+    pass
+
+
+class Doc(list):
+    def __init__(self, vocab, words=None):
+        list.__init__(self)
+        self.extend([Token(i, w) for i, w in enumerate(words)])
+
+
+class Token(object):
+    def __init__(self, i, word):
+        self.i = i
+        self.text = word
+
+
+def find_matches(patterns, doc):
+    init_states = [(pattern, 0, None) for pattern in patterns]
+    curr_states = []
+    matches = []
+    for token in doc:
+        nexts = []
+        for state in (curr_states + init_states):
+            matches, nexts = transition(state, token, matches, nexts)
+        curr_states = nexts
+    return matches
+ 
+
+def transition(state, token, matches, nexts):
+    action = get_action(state, token)
+    is_match, keep_state, advance_state = [bool(int(c)) for c in action]
+    pattern, i, start = state
+    if start is None:
+        start = token.i
+    if is_match:
+        matches.append((pattern, start, token.i+1))
+    if advance_state:
+        nexts.append((pattern, i+1, start))
+    if keep_state:
+        # TODO: This needs to be zero-width :(.
+        nexts.append((pattern, i, start))
+    return (matches, nexts)
+
+
+def get_action(state, token):
+    '''We need to consider:
+
+    a) Does the token match the specification? [Yes, No]
+    b) What's the quantifier? [1, 0+, ?]
+    c) Is this the last specification? [final, non-final]
+
+    We can transition in the following ways:
+
+    a) Do we emit a match?
+    b) Do we add a state with (next state, next token)?
+    c) Do we add a state with (next state, same token)?
+    d) Do we add a state with (same state, next token)?
+
+    We'll code the actions as boolean strings, so 0000 means no to all 4,
+    1000 means match but no states added, etc.
+    
+    1:
+      Yes, final:
+        1000
+      Yes, non-final:
+        0100
+      No, final:
+        0000
+      No, non-final
+        0000
+    0+:
+      Yes, final:
+        1001
+      Yes, non-final:
+        0111
+      No, final:
+        1000 (note: Don't include last token!)
+      No, non-final:
+        0010
+    ?:
+      Yes, final:
+        1000
+      Yes, non-final:
+        0100
+      No, final:
+        1000 (note: Don't include last token!)
+      No, non-final:
+        0010
+
+    Problem: If a quantifier is matching, we're adding a lot of open partials
+    '''
+    is_match = get_is_match(state, token)
+    operator = get_operator(state, token)
+    is_final = get_is_final(state, token)
+    raise NotImplementedError
+
+
+def get_is_match(state, token):
+    pattern, i, start = state
+    is_match = token.text == pattern[i]['spec']
+    if pattern[i].get('invert'):
+        return not is_match
+    else:
+        return is_match
+
+def get_is_final(state, token):
+    pattern, i, start = state
+    return i == len(pattern)-1
+
+def get_operator(state, token):
+    pattern, i, start = state
+    return pattern[i].get('op', '1')
+
+
+########################
+# Tests for get_action #
+########################
+
+
+def test_get_action_simple_match():
+    pattern = [{'spec': 'a', 'op': '1'}]
+    doc = Doc(Vocab(), words=['a'])
+    state = (pattern, 0, None)
+    action = get_action(state, doc[0])
+    assert action == '100'
+
+
+def test_get_action_simple_reject():
+    pattern = [{'spec': 'b', 'op': '1'}]
+    doc = Doc(Vocab(), words=['a'])
+    state = (pattern, 0, None)
+    action = get_action(state, doc[0])
+    assert action == '000'
+
+
+def test_get_action_simple_match_match():
+    pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'a', 'op': '1'}]
+    doc = Doc(Vocab(), words=['a', 'a'])
+    state = (pattern, 0, None)
+    action = get_action(state, doc[0])
+    assert action == '001'
+    state = (pattern, 1, 0)
+    action = get_action(state, doc[1])
+    assert action == '100'
+
+
+def test_get_action_simple_match_reject():
+    pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]
+    doc = Doc(Vocab(), words=['a', 'a'])
+    state = (pattern, 0, None)
+    action = get_action(state, doc[0])
+    assert action == '001'
+    state = (pattern, 1, 0)
+    action = get_action(state, doc[1])
+    assert action == '000'
+
+
+def test_get_action_simple_match_reject():
+    pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]
+    doc = Doc(Vocab(), words=['a', 'a'])
+    state = (pattern, 0, None)
+    action = get_action(state, doc[0])
+    assert action == '001'
+    state = (pattern, 1, 0)
+    action = get_action(state, doc[1])
+    assert action == '000'
+
+
+def test_get_action_plus_match():
+    pattern = [{'spec': 'a', 'op': '1+'}]
+    doc = Doc(Vocab(), words=['a'])
+    state = (pattern, 0, None)
+    action = get_action(state, doc[0])
+    assert action == '110'
+
+
+def test_get_action_plus_match_match():
+    pattern = [{'spec': 'a', 'op': '1+'}]
+    doc = Doc(Vocab(), words=['a', 'a'])
+    state = (pattern, 0, None)
+    action = get_action(state, doc[0])
+    assert action == '110'
+    state = (pattern, 0, 0)
+    action = get_action(state, doc[1])
+    assert action == '110'
+
+
+##########################
+# Tests for find_matches #
+##########################
+
+def test_find_matches_simple_accept():
+    pattern = [{'spec': 'a', 'op': '1'}]
+    doc = Doc(Vocab(), words=['a'])
+    matches = find_matches([pattern], doc)
+    assert matches == [(pattern, 0, 1)]
+
+
+def test_find_matches_simple_reject():
+    pattern = [{'spec': 'a', 'op': '1'}]
+    doc = Doc(Vocab(), words=['b'])
+    matches = find_matches([pattern], doc)
+    assert matches == []
+
+
+def test_find_matches_match_twice():
+    pattern = [{'spec': 'a', 'op': '1'}]
+    doc = Doc(Vocab(), words=['a', 'a'])
+    matches = find_matches([pattern], doc)
+    assert matches == [(pattern, 0, 1), (pattern, 1, 2)]
+
+
+def test_find_matches_longer_pattern():
+    pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]
+    doc = Doc(Vocab(), words=['a', 'b'])
+    matches = find_matches([pattern], doc)
+    assert matches == [(pattern, 0, 2)]
+
+
+def test_find_matches_two_patterns():
+    patterns = [[{'spec': 'a', 'op': '1'}], [{'spec': 'b', 'op': '1'}]]
+    doc = Doc(Vocab(), words=['a', 'b'])
+    matches = find_matches(patterns, doc)
+    assert matches == [(patterns[0], 0, 1), (patterns[1], 1, 2)]
+
+
+def test_find_matches_two_patterns_overlap():
+    patterns = [[{'spec': 'a'}, {'spec': 'b'}],
+                [{'spec': 'b'}, {'spec': 'c'}]]
+    doc = Doc(Vocab(), words=['a', 'b', 'c'])
+    matches = find_matches(patterns, doc)
+    assert matches == [(patterns[0], 0, 2), (patterns[1], 1, 3)]
+
+
+def test_find_matches_greedy():
+    patterns = [[{'spec': 'a', 'op': '1+'}]]
+    doc = Doc(Vocab(), words=['a'])
+    matches = find_matches(patterns, doc)
+    assert matches == [(patterns[0], 0, 1)]
+    doc = Doc(Vocab(), words=['a', 'a'])
+    matches = find_matches(patterns, doc)
+    assert matches == [(patterns[0], 0, 1), (patterns[0], 0, 2), (patterns[0], 1, 2)]
+
+def test_find_matches_non_greedy():
+    patterns = [[{'spec': 'a', 'op': '0+'}, {'spec': 'b', "op": "1"}]]
+    doc = Doc(Vocab(), words=['b'])
+    matches = find_matches(patterns, doc)
+    assert matches == [(patterns[0], 0, 1)]
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@ -64,23 +64,6 @@ def _flatten_add_lengths(seqs, pad=0, drop=0.):
    return (X, lengths), finish_update


-@layerize
-def _logistic(X, drop=0.):
-    xp = get_array_module(X)
-    if not isinstance(X, xp.ndarray):
-        X = xp.asarray(X)
-    # Clip to range (-10, 10)
-    X = xp.minimum(X, 10., X)
-    X = xp.maximum(X, -10., X)
-    Y = 1. / (1. + xp.exp(-X))
-
-    def logistic_bwd(dY, sgd=None):
-        dX = dY * (Y * (1-Y))
-        return dX
-
-    return Y, logistic_bwd
-
-
 def _zero_init(model):
    def _zero_init_impl(self, X, y):
        self.W.fill(0)
@ -144,8 +127,8 @@ class PrecomputableAffine(Model):
        self.nF = nF

    def begin_update(self, X, drop=0.):
-        Yf = self.ops.xp.dot(X,
-            self.W.reshape((self.nF*self.nO*self.nP, self.nI)).T)
+        Yf = self.ops.gemm(X,
+            self.W.reshape((self.nF*self.nO*self.nP, self.nI)), trans2=True)
        Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP))
        Yf = self._add_padding(Yf)

@ -161,11 +144,11 @@ class PrecomputableAffine(Model):
            Wopfi = self.W.transpose((1, 2, 0, 3))
            Wopfi = self.ops.xp.ascontiguousarray(Wopfi)
            Wopfi = Wopfi.reshape((self.nO*self.nP, self.nF * self.nI))
-            dXf = self.ops.dot(dY.reshape((dY.shape[0], self.nO*self.nP)), Wopfi)
+            dXf = self.ops.gemm(dY.reshape((dY.shape[0], self.nO*self.nP)), Wopfi)

            # Reuse the buffer
            dWopfi = Wopfi; dWopfi.fill(0.)
-            self.ops.xp.dot(dY.T, Xf, out=dWopfi)
+            self.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
            dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI))
            # (o, p, f, i) --> (f, o, p, i)
            self.d_W += dWopfi.transpose((2, 0, 1, 3))
@ -467,6 +450,7 @@ def SpacyVectors(docs, drop=0.):


 def build_text_classifier(nr_class, width=64, **cfg):
+    depth = cfg.get('depth', 2)
    nr_vector = cfg.get('nr_vector', 5000)
    pretrained_dims = cfg.get('pretrained_dims', 0)
    with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
@ -518,7 +502,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
                LN(Maxout(width, vectors_width))
                >> Residual(
                    (ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
-                ) ** 2, pad=2
+                ) ** depth, pad=depth
            )
            >> flatten_add_lengths
            >> ParametricAttention(width)
@ -531,8 +515,6 @@ def build_text_classifier(nr_class, width=64, **cfg):
            _preprocess_doc
            >> LinearModel(nr_class)
        )
-        #model = linear_model >> logistic
-
        model = (
            (linear_model | cnn_model)
            >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
--- a/spacy/about.py
+++ b/spacy/about.py
@ -9,7 +9,7 @@ __uri__ = 'https://spacy.io'
 __author__ = 'Explosion AI'
 __email__ = 'contact@explosion.ai'
 __license__ = 'MIT'
-__release__ = True
+__release__ = False

 __docs_models__ = 'https://spacy.io/usage/models'
 __download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@ -131,7 +131,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
            'NumValue', 'PartType', 'Polite', 'StyleVariant',
            'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
            'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
-            'Polarity', 'Animacy' # U20
+            'Polarity', 'PrepCase', 'Animacy' # U20
        ]
        for key in morph_keys:
            if key in stringy_attrs:
--- a/spacy/cli/init.py
+++ b/spacy/cli/init.py
@ -9,3 +9,5 @@ from .convert import convert
 from .vocab import make_vocab as vocab
 from .init_model import init_model
 from .validate import validate
+from .ud_train import main as ud_train
+from .conll17_ud_eval import main as ud_evaluate
--- a/spacy/cli/conll17_ud_eval.py
+++ b/spacy/cli/conll17_ud_eval.py
@ -0,0 +1,571 @@
+#!/usr/bin/env python
+
+# CoNLL 2017 UD Parsing evaluation script.
+#
+# Compatible with Python 2.7 and 3.2+, can be used either as a module
+# or a standalone executable.
+#
+# Copyright 2017 Institute of Formal and Applied Linguistics (UFAL),
+# Faculty of Mathematics and Physics, Charles University, Czech Republic.
+#
+# Changelog:
+# - [02 Jan 2017] Version 0.9: Initial release
+# - [25 Jan 2017] Version 0.9.1: Fix bug in LCS alignment computation
+# - [10 Mar 2017] Version 1.0: Add documentation and test
+#                              Compare HEADs correctly using aligned words
+#                              Allow evaluation with errorneous spaces in forms
+#                              Compare forms in LCS case insensitively
+#                              Detect cycles and multiple root nodes
+#                              Compute AlignedAccuracy
+
+# Command line usage
+# ------------------
+# conll17_ud_eval.py [-v] [-w weights_file] gold_conllu_file system_conllu_file
+#
+# - if no -v is given, only the CoNLL17 UD Shared Task evaluation LAS metrics
+#   is printed
+# - if -v is given, several metrics are printed (as precision, recall, F1 score,
+#   and in case the metric is computed on aligned words also accuracy on these):
+#   - Tokens: how well do the gold tokens match system tokens
+#   - Sentences: how well do the gold sentences match system sentences
+#   - Words: how well can the gold words be aligned to system words
+#   - UPOS: using aligned words, how well does UPOS match
+#   - XPOS: using aligned words, how well does XPOS match
+#   - Feats: using aligned words, how well does FEATS match
+#   - AllTags: using aligned words, how well does UPOS+XPOS+FEATS match
+#   - Lemmas: using aligned words, how well does LEMMA match
+#   - UAS: using aligned words, how well does HEAD match
+#   - LAS: using aligned words, how well does HEAD+DEPREL(ignoring subtypes) match
+# - if weights_file is given (with lines containing deprel-weight pairs),
+#   one more metric is shown:
+#   - WeightedLAS: as LAS, but each deprel (ignoring subtypes) has different weight
+
+# API usage
+# ---------
+# - load_conllu(file)
+#   - loads CoNLL-U file from given file object to an internal representation
+#   - the file object should return str on both Python 2 and Python 3
+#   - raises UDError exception if the given file cannot be loaded
+# - evaluate(gold_ud, system_ud)
+#   - evaluate the given gold and system CoNLL-U files (loaded with load_conllu)
+#   - raises UDError if the concatenated tokens of gold and system file do not match
+#   - returns a dictionary with the metrics described above, each metrics having
+#     three fields: precision, recall and f1
+
+# Description of token matching
+# -----------------------------
+# In order to match tokens of gold file and system file, we consider the text
+# resulting from concatenation of gold tokens and text resulting from
+# concatenation of system tokens. These texts should match -- if they do not,
+# the evaluation fails.
+#
+# If the texts do match, every token is represented as a range in this original
+# text, and tokens are equal only if their range is the same.
+
+# Description of word matching
+# ----------------------------
+# When matching words of gold file and system file, we first match the tokens.
+# The words which are also tokens are matched as tokens, but words in multi-word
+# tokens have to be handled differently.
+#
+# To handle multi-word tokens, we start by finding "multi-word spans".
+# Multi-word span is a span in the original text such that
+# - it contains at least one multi-word token
+# - all multi-word tokens in the span (considering both gold and system ones)
+#   are completely inside the span (i.e., they do not "stick out")
+# - the multi-word span is as small as possible
+#
+# For every multi-word span, we align the gold and system words completely
+# inside this span using LCS on their FORMs. The words not intersecting
+# (even partially) any multi-word span are then aligned as tokens.
+
+
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import io
+import sys
+import unittest
+
+# CoNLL-U column names
+ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10)
+
+# UD Error is used when raising exceptions in this module
+class UDError(Exception):
+    pass
+
+# Load given CoNLL-U file into internal representation
+def load_conllu(file):
+    # Internal representation classes
+    class UDRepresentation:
+        def __init__(self):
+            # Characters of all the tokens in the whole file.
+            # Whitespace between tokens is not included.
+            self.characters = []
+            # List of UDSpan instances with start&end indices into `characters`.
+            self.tokens = []
+            # List of UDWord instances.
+            self.words = []
+            # List of UDSpan instances with start&end indices into `characters`.
+            self.sentences = []
+    class UDSpan:
+        def __init__(self, start, end, characters):
+            self.start = start
+            # Note that self.end marks the first position **after the end** of span,
+            # so we can use characters[start:end] or range(start, end).
+            self.end = end
+            self.characters = characters
+
+        @property
+        def text(self):
+            return ''.join(self.characters[self.start:self.end])
+
+        def __str__(self):
+            return self.text
+
+        def __repr__(self):
+            return self.text
+    class UDWord:
+        def __init__(self, span, columns, is_multiword):
+            # Span of this word (or MWT, see below) within ud_representation.characters.
+            self.span = span
+            # 10 columns of the CoNLL-U file: ID, FORM, LEMMA,...
+            self.columns = columns
+            # is_multiword==True means that this word is part of a multi-word token.
+            # In that case, self.span marks the span of the whole multi-word token.
+            self.is_multiword = is_multiword
+            # Reference to the UDWord instance representing the HEAD (or None if root).
+            self.parent = None
+            # Let's ignore language-specific deprel subtypes.
+            self.columns[DEPREL] = columns[DEPREL].split(':')[0]
+
+    ud = UDRepresentation()
+
+    # Load the CoNLL-U file
+    index, sentence_start = 0, None
+    linenum = 0
+    while True:
+        line = file.readline()
+        linenum += 1
+        if not line:
+            break
+        line = line.rstrip("\r\n")
+
+        # Handle sentence start boundaries
+        if sentence_start is None:
+            # Skip comments
+            if line.startswith("#"):
+                continue
+            # Start a new sentence
+            ud.sentences.append(UDSpan(index, 0, ud.characters))
+            sentence_start = len(ud.words)
+        if not line:
+            # Add parent UDWord links and check there are no cycles
+            def process_word(word):
+                if word.parent == "remapping":
+                    raise UDError("There is a cycle in a sentence")
+                if word.parent is None:
+                    head = int(word.columns[HEAD])
+                    if head > len(ud.words) - sentence_start:
+                        raise UDError("Line {}: HEAD '{}' points outside of the sentence".format(
+                            linenum, word.columns[HEAD]))
+                    if head:
+                        parent = ud.words[sentence_start + head - 1]
+                        word.parent = "remapping"
+                        process_word(parent)
+                        word.parent = parent
+
+            for word in ud.words[sentence_start:]:
+                process_word(word)
+
+            # Check there is a single root node
+            if len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1:
+                raise UDError("There are multiple roots in a sentence")
+
+            # End the sentence
+            ud.sentences[-1].end = index
+            sentence_start = None
+            continue
+
+        # Read next token/word
+        columns = line.split("\t")
+        if len(columns) != 10:
+            raise UDError("The CoNLL-U line {} does not contain 10 tab-separated columns: '{}'".format(linenum, line))
+
+        # Skip empty nodes
+        if "." in columns[ID]:
+            continue
+
+        # Delete spaces from FORM  so gold.characters == system.characters
+        # even if one of them tokenizes the space.
+        columns[FORM] = columns[FORM].replace(" ", "")
+        if not columns[FORM]:
+            raise UDError("There is an empty FORM in the CoNLL-U file -- line %d" % linenum)
+
+        # Save token
+        ud.characters.extend(columns[FORM])
+        ud.tokens.append(UDSpan(index, index + len(columns[FORM]), ud.characters))
+        index += len(columns[FORM])
+
+        # Handle multi-word tokens to save word(s)
+        if "-" in columns[ID]:
+            try:
+                start, end = map(int, columns[ID].split("-"))
+            except:
+                raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID]))
+            
+            for _ in range(start, end + 1):
+                word_line = file.readline().rstrip("\r\n")
+                word_columns = word_line.split("\t")
+                if len(word_columns) != 10:
+                    print(columns)
+                    raise UDError("The CoNLL-U line {} does not contain 10 tab-separated columns: '{}'".format(linenum, word_line))
+                ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True))
+        # Basic tokens/words
+        else:
+            try:
+                word_id = int(columns[ID])
+            except:
+                raise UDError("Cannot parse word ID '{}'".format(columns[ID]))
+            if word_id != len(ud.words) - sentence_start + 1:
+                raise UDError("Incorrect word ID '{}' for word '{}', expected '{}'".format(columns[ID], columns[FORM], len(ud.words) - sentence_start + 1))
+
+            try:
+                head_id = int(columns[HEAD])
+            except:
+                raise UDError("Cannot parse HEAD '{}'".format(columns[HEAD]))
+            if head_id < 0:
+                raise UDError("HEAD cannot be negative")
+
+            ud.words.append(UDWord(ud.tokens[-1], columns, is_multiword=False))
+
+    if sentence_start is not None:
+        raise UDError("The CoNLL-U file does not end with empty line")
+
+    return ud
+
+# Evaluate the gold and system treebanks (loaded using load_conllu).
+def evaluate(gold_ud, system_ud, deprel_weights=None):
+    class Score:
+        def __init__(self, gold_total, system_total, correct, aligned_total=None):
+            self.precision = correct / system_total if system_total else 0.0
+            self.recall = correct / gold_total if gold_total else 0.0
+            self.f1 = 2 * correct / (system_total + gold_total) if system_total + gold_total else 0.0
+            self.aligned_accuracy = correct / aligned_total if aligned_total else aligned_total
+    class AlignmentWord:
+        def __init__(self, gold_word, system_word):
+            self.gold_word = gold_word
+            self.system_word = system_word
+            self.gold_parent = None
+            self.system_parent_gold_aligned = None
+    class Alignment:
+        def __init__(self, gold_words, system_words):
+            self.gold_words = gold_words
+            self.system_words = system_words
+            self.matched_words = []
+            self.matched_words_map = {}
+        def append_aligned_words(self, gold_word, system_word):
+            self.matched_words.append(AlignmentWord(gold_word, system_word))
+            self.matched_words_map[system_word] = gold_word
+        def fill_parents(self):
+            # We represent root parents in both gold and system data by '0'.
+            # For gold data, we represent non-root parent by corresponding gold word.
+            # For system data, we represent non-root parent by either gold word aligned
+            # to parent system nodes, or by None if no gold words is aligned to the parent.
+            for words in self.matched_words:
+                words.gold_parent = words.gold_word.parent if words.gold_word.parent is not None else 0
+                words.system_parent_gold_aligned = self.matched_words_map.get(words.system_word.parent, None) \
+                    if words.system_word.parent is not None else 0
+
+    def lower(text):
+        if sys.version_info < (3, 0) and isinstance(text, str):
+            return text.decode("utf-8").lower()
+        return text.lower()
+
+    def spans_score(gold_spans, system_spans):
+        correct, gi, si = 0, 0, 0
+        while gi < len(gold_spans) and si < len(system_spans):
+            if system_spans[si].start < gold_spans[gi].start:
+                si += 1
+            elif gold_spans[gi].start < system_spans[si].start:
+                gi += 1
+            else:
+                correct += gold_spans[gi].end == system_spans[si].end
+                si += 1
+                gi += 1
+
+        return Score(len(gold_spans), len(system_spans), correct)
+
+    def alignment_score(alignment, key_fn, weight_fn=lambda w: 1):
+        gold, system, aligned, correct = 0, 0, 0, 0
+
+        for word in alignment.gold_words:
+            gold += weight_fn(word)
+
+        for word in alignment.system_words:
+            system += weight_fn(word)
+
+        for words in alignment.matched_words:
+            aligned += weight_fn(words.gold_word)
+
+        if key_fn is None:
+            # Return score for whole aligned words
+            return Score(gold, system, aligned)
+
+        for words in alignment.matched_words:
+            if key_fn(words.gold_word, words.gold_parent) == key_fn(words.system_word, words.system_parent_gold_aligned):
+                correct += weight_fn(words.gold_word)
+
+        return Score(gold, system, correct, aligned)
+
+    def beyond_end(words, i, multiword_span_end):
+        if i >= len(words):
+            return True
+        if words[i].is_multiword:
+            return words[i].span.start >= multiword_span_end
+        return words[i].span.end > multiword_span_end
+
+    def extend_end(word, multiword_span_end):
+        if word.is_multiword and word.span.end > multiword_span_end:
+            return word.span.end
+        return multiword_span_end
+
+    def find_multiword_span(gold_words, system_words, gi, si):
+        # We know gold_words[gi].is_multiword or system_words[si].is_multiword.
+        # Find the start of the multiword span (gs, ss), so the multiword span is minimal.
+        # Initialize multiword_span_end characters index.
+        if gold_words[gi].is_multiword:
+            multiword_span_end = gold_words[gi].span.end
+            if not system_words[si].is_multiword and system_words[si].span.start < gold_words[gi].span.start:
+                si += 1
+        else: # if system_words[si].is_multiword
+            multiword_span_end = system_words[si].span.end
+            if not gold_words[gi].is_multiword and gold_words[gi].span.start < system_words[si].span.start:
+                gi += 1
+        gs, ss = gi, si
+
+        # Find the end of the multiword span
+        # (so both gi and si are pointing to the word following the multiword span end).
+        while not beyond_end(gold_words, gi, multiword_span_end) or \
+              not beyond_end(system_words, si, multiword_span_end):
+            if gi < len(gold_words) and (si >= len(system_words) or
+                                         gold_words[gi].span.start <= system_words[si].span.start):
+                multiword_span_end = extend_end(gold_words[gi], multiword_span_end)
+                gi += 1
+            else:
+                multiword_span_end = extend_end(system_words[si], multiword_span_end)
+                si += 1
+        return gs, ss, gi, si
+
+    def compute_lcs(gold_words, system_words, gi, si, gs, ss):
+        lcs = [[0] * (si - ss) for i in range(gi - gs)]
+        for g in reversed(range(gi - gs)):
+            for s in reversed(range(si - ss)):
+                if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]):
+                    lcs[g][s] = 1 + (lcs[g+1][s+1] if g+1 < gi-gs and s+1 < si-ss else 0)
+                lcs[g][s] = max(lcs[g][s], lcs[g+1][s] if g+1 < gi-gs else 0)
+                lcs[g][s] = max(lcs[g][s], lcs[g][s+1] if s+1 < si-ss else 0)
+        return lcs
+
+    def align_words(gold_words, system_words):
+        alignment = Alignment(gold_words, system_words)
+
+        gi, si = 0, 0
+        while gi < len(gold_words) and si < len(system_words):
+            if gold_words[gi].is_multiword or system_words[si].is_multiword:
+                # A: Multi-word tokens => align via LCS within the whole "multiword span".
+                gs, ss, gi, si = find_multiword_span(gold_words, system_words, gi, si)
+
+                if si > ss and gi > gs:
+                    lcs = compute_lcs(gold_words, system_words, gi, si, gs, ss)
+
+                    # Store aligned words
+                    s, g = 0, 0
+                    while g < gi - gs and s < si - ss:
+                        if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]):
+                            alignment.append_aligned_words(gold_words[gs+g], system_words[ss+s])
+                            g += 1
+                            s += 1
+                        elif lcs[g][s] == (lcs[g+1][s] if g+1 < gi-gs else 0):
+                            g += 1
+                        else:
+                            s += 1
+            else:
+                # B: No multi-word token => align according to spans.
+                if (gold_words[gi].span.start, gold_words[gi].span.end) == (system_words[si].span.start, system_words[si].span.end):
+                    alignment.append_aligned_words(gold_words[gi], system_words[si])
+                    gi += 1
+                    si += 1
+                elif gold_words[gi].span.start <= system_words[si].span.start:
+                    gi += 1
+                else:
+                    si += 1
+
+        alignment.fill_parents()
+
+        return alignment
+
+    # Check that underlying character sequences do match
+    if gold_ud.characters != system_ud.characters:
+        index = 0
+        while gold_ud.characters[index] == system_ud.characters[index]:
+            index += 1
+
+        raise UDError(
+            "The concatenation of tokens in gold file and in system file differ!\n" +
+            "First 20 differing characters in gold file: '{}' and system file: '{}'".format(
+                "".join(gold_ud.characters[index:index + 20]),
+                "".join(system_ud.characters[index:index + 20])
+            )
+        )
+
+    # Align words
+    alignment = align_words(gold_ud.words, system_ud.words)
+
+    # Compute the F1-scores
+    result = {
+        "Tokens": spans_score(gold_ud.tokens, system_ud.tokens),
+        "Sentences": spans_score(gold_ud.sentences, system_ud.sentences),
+        "Words": alignment_score(alignment, None),
+        "UPOS": alignment_score(alignment, lambda w, parent: w.columns[UPOS]),
+        "XPOS": alignment_score(alignment, lambda w, parent: w.columns[XPOS]),
+        "Feats": alignment_score(alignment, lambda w, parent: w.columns[FEATS]),
+        "AllTags": alignment_score(alignment, lambda w, parent: (w.columns[UPOS], w.columns[XPOS], w.columns[FEATS])),
+        "Lemmas": alignment_score(alignment, lambda w, parent: w.columns[LEMMA]),
+        "UAS": alignment_score(alignment, lambda w, parent: parent),
+        "LAS": alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL])),
+    }
+
+    # Add WeightedLAS if weights are given
+    if deprel_weights is not None:
+        def weighted_las(word):
+            return deprel_weights.get(word.columns[DEPREL], 1.0)
+        result["WeightedLAS"] = alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL]), weighted_las)
+
+    return result
+
+def load_deprel_weights(weights_file):
+    if weights_file is None:
+        return None
+
+    deprel_weights = {}
+    for line in weights_file:
+        # Ignore comments and empty lines
+        if line.startswith("#") or not line.strip():
+            continue
+
+        columns = line.rstrip("\r\n").split()
+        if len(columns) != 2:
+            raise ValueError("Expected two columns in the UD Relations weights file on line '{}'".format(line))
+
+        deprel_weights[columns[0]] = float(columns[1])
+
+    return deprel_weights
+
+def load_conllu_file(path):
+    _file = open(path, mode="r", **({"encoding": "utf-8"} if sys.version_info >= (3, 0) else {}))
+    return load_conllu(_file)
+
+def evaluate_wrapper(args):
+    # Load CoNLL-U files
+    gold_ud = load_conllu_file(args.gold_file)
+    system_ud = load_conllu_file(args.system_file)
+
+    # Load weights if requested
+    deprel_weights = load_deprel_weights(args.weights)
+
+    return evaluate(gold_ud, system_ud, deprel_weights)
+
+def main():
+    # Parse arguments
+    parser = argparse.ArgumentParser()
+    parser.add_argument("gold_file", type=str,
+                        help="Name of the CoNLL-U file with the gold data.")
+    parser.add_argument("system_file", type=str,
+                        help="Name of the CoNLL-U file with the predicted data.")
+    parser.add_argument("--weights", "-w", type=argparse.FileType("r"), default=None,
+                        metavar="deprel_weights_file",
+                        help="Compute WeightedLAS using given weights for Universal Dependency Relations.")
+    parser.add_argument("--verbose", "-v", default=0, action="count",
+                        help="Print all metrics.")
+    args = parser.parse_args()
+
+    # Use verbose if weights are supplied
+    if args.weights is not None and not args.verbose:
+        args.verbose = 1
+
+    # Evaluate
+    evaluation = evaluate_wrapper(args)
+
+    # Print the evaluation
+    if not args.verbose:
+        print("LAS F1 Score: {:.2f}".format(100 * evaluation["LAS"].f1))
+    else:
+        metrics = ["Tokens", "Sentences", "Words", "UPOS", "XPOS", "Feats", "AllTags", "Lemmas", "UAS", "LAS"]
+        if args.weights is not None:
+            metrics.append("WeightedLAS")
+
+        print("Metrics    | Precision |    Recall |  F1 Score | AligndAcc")
+        print("-----------+-----------+-----------+-----------+-----------")
+        for metric in metrics:
+            print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format(
+                metric,
+                100 * evaluation[metric].precision,
+                100 * evaluation[metric].recall,
+                100 * evaluation[metric].f1,
+                "{:10.2f}".format(100 * evaluation[metric].aligned_accuracy) if evaluation[metric].aligned_accuracy is not None else ""
+            ))
+
+if __name__ == "__main__":
+    main()
+
+# Tests, which can be executed with `python -m unittest conll17_ud_eval`.
+class TestAlignment(unittest.TestCase):
+    @staticmethod
+    def _load_words(words):
+        """Prepare fake CoNLL-U files with fake HEAD to prevent multiple roots errors."""
+        lines, num_words = [], 0
+        for w in words:
+            parts = w.split(" ")
+            if len(parts) == 1:
+                num_words += 1
+                lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, parts[0], int(num_words>1)))
+            else:
+                lines.append("{}-{}\t{}\t_\t_\t_\t_\t_\t_\t_\t_".format(num_words + 1, num_words + len(parts) - 1, parts[0]))
+                for part in parts[1:]:
+                    num_words += 1
+                    lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, part, int(num_words>1)))
+        return load_conllu((io.StringIO if sys.version_info >= (3, 0) else io.BytesIO)("\n".join(lines+["\n"])))
+
+    def _test_exception(self, gold, system):
+        self.assertRaises(UDError, evaluate, self._load_words(gold), self._load_words(system))
+
+    def _test_ok(self, gold, system, correct):
+        metrics = evaluate(self._load_words(gold), self._load_words(system))
+        gold_words = sum((max(1, len(word.split(" ")) - 1) for word in gold))
+        system_words = sum((max(1, len(word.split(" ")) - 1) for word in system))
+        self.assertEqual((metrics["Words"].precision, metrics["Words"].recall, metrics["Words"].f1),
+                         (correct / system_words, correct / gold_words, 2 * correct / (gold_words + system_words)))
+
+    def test_exception(self):
+        self._test_exception(["a"], ["b"])
+
+    def test_equal(self):
+        self._test_ok(["a"], ["a"], 1)
+        self._test_ok(["a", "b", "c"], ["a", "b", "c"], 3)
+
+    def test_equal_with_multiword(self):
+        self._test_ok(["abc a b c"], ["a", "b", "c"], 3)
+        self._test_ok(["a", "bc b c", "d"], ["a", "b", "c", "d"], 4)
+        self._test_ok(["abcd a b c d"], ["ab a b", "cd c d"], 4)
+        self._test_ok(["abc a b c", "de d e"], ["a", "bcd b c d", "e"], 5)
+
+    def test_alignment(self):
+        self._test_ok(["abcd"], ["a", "b", "c", "d"], 0)
+        self._test_ok(["abc", "d"], ["a", "b", "c", "d"], 1)
+        self._test_ok(["a", "bc", "d"], ["a", "b", "c", "d"], 2)
+        self._test_ok(["a", "bc b c", "d"], ["a", "b", "cd"], 2)
+        self._test_ok(["abc a BX c", "def d EX f"], ["ab a b", "cd c d", "ef e f"], 4)
+        self._test_ok(["ab a b", "cd bc d"], ["a", "bc", "d"], 2)
+        self._test_ok(["a", "bc b c", "d"], ["ab AX BX", "cd CX a"], 1)
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -8,8 +8,8 @@ from thinc.neural._classes.model import Model
 from timeit import default_timer as timer

 from ..attrs import PROB, IS_OOV, CLUSTER, LANG
-from ..gold import GoldCorpus, minibatch
-from ..util import prints
+from ..gold import GoldCorpus
+from ..util import prints, minibatch, minibatch_by_words
 from .. import util
 from .. import about
 from .. import displacy
@ -51,8 +51,6 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
    train_path = util.ensure_path(train_data)
    dev_path = util.ensure_path(dev_data)
    meta_path = util.ensure_path(meta_path)
-    if not output_path.exists():
-        output_path.mkdir()
    if not train_path.exists():
        prints(train_path, title="Training data not found", exits=1)
    if dev_path and not dev_path.exists():
@ -65,7 +63,14 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
               title="Not a valid meta.json format", exits=1)
    meta.setdefault('lang', lang)
    meta.setdefault('name', 'unnamed')
+    
+    if not output_path.exists():
+        output_path.mkdir()

+    print("Counting training words (limit=%s" % n_sents)
+    corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
+    n_train_words = corpus.count_train()
+    print(n_train_words)
    pipeline = ['tagger', 'parser', 'ner']
    if no_tagger and 'tagger' in pipeline:
        pipeline.remove('tagger')
@ -81,13 +86,9 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                  util.env_opt('dropout_to', 0.2),
                                  util.env_opt('dropout_decay', 0.0))
-    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
-                                   util.env_opt('batch_to', 16),
+    batch_sizes = util.compounding(util.env_opt('batch_from', 1000),
+                                   util.env_opt('batch_to', 1000),
                                   util.env_opt('batch_compound', 1.001))
-    max_doc_len = util.env_opt('max_doc_len', 5000)
-    corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
-    n_train_words = corpus.count_train()
-
    lang_class = util.get_lang_class(lang)
    nlp = lang_class()
    meta['pipeline'] = pipeline
@ -105,6 +106,7 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
            lex.is_oov = False
    for name in pipeline:
        nlp.add_pipe(nlp.create_pipe(name), name=name)
+    nlp.add_pipe(nlp.create_pipe('merge_subtokens'))
    if parser_multitasks:
        for objective in parser_multitasks.split(','):
            nlp.parser.add_multitask_objective(objective)
@ -116,21 +118,20 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,

    print("Itn.\tP.Loss\tN.Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
    try:
-        train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
-                                       gold_preproc=gold_preproc, max_length=0)
-        train_docs = list(train_docs)
        for i in range(n_iter):
+            train_docs = corpus.train_docs(nlp, noise_level=0.0,
+                                           gold_preproc=gold_preproc, max_length=0)
+            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
-                for batch in minibatch(train_docs, size=batch_sizes):
-                    batch = [(d, g) for (d, g) in batch if len(d) < max_doc_len]
+                for batch in minibatch_by_words(train_docs, size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(docs, golds, sgd=optimizer,
                               drop=next(dropout_rates), losses=losses)
                    pbar.update(sum(len(doc) for doc in docs))
-
+                    words_seen += sum(len(doc) for doc in docs)
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ('model%d' % i)
--- a/spacy/compat.py
+++ b/spacy/compat.py
@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals

-import six
 import ftfy
 import sys
 import ujson
@ -47,9 +46,10 @@ is_windows = sys.platform.startswith('win')
 is_linux = sys.platform.startswith('linux')
 is_osx = sys.platform == 'darwin'

-is_python2 = six.PY2
-is_python3 = six.PY3
-is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1]<5)
+# See: https://github.com/benjaminp/six/blob/master/six.py
+is_python2 = sys.version_info[0] == 2
+is_python3 = sys.version_info[0] == 3
+is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1] < 5)

 if is_python2:
    bytes_ = str
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -3,16 +3,25 @@
 from __future__ import unicode_literals, print_function

 import re
-import ujson
 import random
 import cytoolz
 import itertools
+import numpy
+import tempfile
+import shutil
+from pathlib import Path
+import msgpack

+import ujson
+
+from . import _align 
 from .syntax import nonproj
 from .tokens import Doc
 from . import util
-from .util import minibatch
+from .util import minibatch, itershuffle
+from .compat import json_dumps

+from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek

 def tags_to_entities(tags):
    entities = []
@ -59,196 +68,62 @@ def merge_sents(sents):
    return [(m_deps, m_brackets)]


-def align(cand_words, gold_words):
-    cost, edit_path = _min_edit_path(cand_words, gold_words)
-    alignment = []
-    i_of_gold = 0
-    for move in edit_path:
-        if move == 'M':
-            alignment.append(i_of_gold)
-            i_of_gold += 1
-        elif move == 'S':
-            alignment.append(None)
-            i_of_gold += 1
-        elif move == 'D':
-            alignment.append(None)
-        elif move == 'I':
-            i_of_gold += 1
-        else:
-            raise Exception(move)
-    return alignment
-
-
 punct_re = re.compile(r'\W')
-
-
-def _min_edit_path(cand_words, gold_words):
-    cdef:
-        Pool mem
-        int i, j, n_cand, n_gold
-        int* curr_costs
-        int* prev_costs
-
-    # TODO: Fix this --- just do it properly, make the full edit matrix and
-    # then walk back over it...
-    # Preprocess inputs
-    cand_words = [punct_re.sub('', w).lower() for w in cand_words]
-    gold_words = [punct_re.sub('', w).lower() for w in gold_words]
-
+def align(cand_words, gold_words):
    if cand_words == gold_words:
-        return 0, ''.join(['M' for _ in gold_words])
-    mem = Pool()
-    n_cand = len(cand_words)
-    n_gold = len(gold_words)
-    # Levenshtein distance, except we need the history, and we may want
-    # different costs. Mark operations with a string, and score the history
-    # using _edit_cost.
-    previous_row = []
-    prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
-    curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
-    for i in range(n_gold + 1):
-        cell = ''
-        for j in range(i):
-            cell += 'I'
-        previous_row.append('I' * i)
-        prev_costs[i] = i
-    for i, cand in enumerate(cand_words):
-        current_row = ['D' * (i + 1)]
-        curr_costs[0] = i+1
-        for j, gold in enumerate(gold_words):
-            if gold.lower() == cand.lower():
-                s_cost = prev_costs[j]
-                i_cost = curr_costs[j] + 1
-                d_cost = prev_costs[j + 1] + 1
-            else:
-                s_cost = prev_costs[j] + 1
-                i_cost = curr_costs[j] + 1
-                d_cost = prev_costs[j + 1] + (1 if cand else 0)
-
-            if s_cost <= i_cost and s_cost <= d_cost:
-                best_cost = s_cost
-                best_hist = previous_row[j] + ('M' if gold == cand else 'S')
-            elif i_cost <= s_cost and i_cost <= d_cost:
-                best_cost = i_cost
-                best_hist = current_row[j] + 'I'
-            else:
-                best_cost = d_cost
-                best_hist = previous_row[j + 1] + 'D'
-
-            current_row.append(best_hist)
-            curr_costs[j+1] = best_cost
-        previous_row = current_row
-        for j in range(len(gold_words) + 1):
-            prev_costs[j] = curr_costs[j]
-            curr_costs[j] = 0
-
-    return prev_costs[n_gold], previous_row[-1]
+        alignment = numpy.arange(len(cand_words))
+        return 0, alignment, alignment, {}, {}
+    cand_words = [w.replace(' ', '') for w in cand_words]
+    gold_words = [w.replace(' ', '') for w in gold_words]
+    cost, i2j, j2i, matrix = _align.align(cand_words, gold_words)
+    i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in cand_words],
+                                [len(w) for w in gold_words])
+    for i, j in list(i2j_multi.items()):
+        if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j:
+            i2j[i] = j
+            i2j_multi.pop(i)
+    for j, i in list(j2i_multi.items()):
+        if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i:
+            j2i[j] = i
+            j2i_multi.pop(j)
+    return cost, i2j, j2i, i2j_multi, j2i_multi


 class GoldCorpus(object):
    """An annotated corpus, using the JSON file format. Manages
    annotations for tagging, dependency parsing and NER."""
-    def __init__(self, train_path, dev_path, gold_preproc=True, limit=None):
+    def __init__(self, train, dev, gold_preproc=False, limit=None):
        """Create a GoldCorpus.

        train_path (unicode or Path): File or directory of training data.
        dev_path (unicode or Path): File or directory of development data.
        RETURNS (GoldCorpus): The newly created object.
        """
-        self.train_path = util.ensure_path(train_path)
-        self.dev_path = util.ensure_path(dev_path)
        self.limit = limit
-        self.train_locs = self.walk_corpus(self.train_path)
-        self.dev_locs = self.walk_corpus(self.dev_path)
+        if isinstance(train, str) or isinstance(train, Path):
+            train = self.read_tuples(self.walk_corpus(train))
+            dev = self.read_tuples(self.walk_corpus(dev))

-    @property
-    def train_tuples(self):
-        i = 0
-        for loc in self.train_locs:
-            gold_tuples = read_json_file(loc)
-            for item in gold_tuples:
-                yield item
-                i += len(item[1])
-                if self.limit and i >= self.limit:
-                    break
+        # Write temp directory with one doc per file, so we can shuffle
+        # and stream
+        self.tmp_dir = Path(tempfile.mkdtemp())
+        self.write_msgpack(self.tmp_dir / 'train', train)
+        self.write_msgpack(self.tmp_dir / 'dev', dev)

-    @property
-    def dev_tuples(self):
-        i = 0
-        for loc in self.dev_locs:
-            gold_tuples = read_json_file(loc)
-            for item in gold_tuples:
-                yield item
-                i += len(item[1])
-                if self.limit and i >= self.limit:
-                    break
-
-    def count_train(self):
-        n = 0
-        i = 0
-        for raw_text, paragraph_tuples in self.train_tuples:
-            n += sum([len(s[0][1]) for s in paragraph_tuples])
-            if self.limit and i >= self.limit:
-                break
-            i += len(paragraph_tuples)
-        return n
-
-    def train_docs(self, nlp, gold_preproc=False,
-                   projectivize=False, max_length=None,
-                   noise_level=0.0):
-        train_tuples = self.train_tuples
-        if projectivize:
-            train_tuples = nonproj.preprocess_training_data(
-                self.train_tuples, label_freq_cutoff=100)
-        random.shuffle(train_tuples)
-        gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
-                                        max_length=max_length,
-                                        noise_level=noise_level)
-        yield from gold_docs
-
-    def dev_docs(self, nlp, gold_preproc=False):
-        gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
-        yield from gold_docs
-
-    @classmethod
-    def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
-                       noise_level=0.0):
-        for raw_text, paragraph_tuples in tuples:
-            if gold_preproc:
-                raw_text = None
-            else:
-                paragraph_tuples = merge_sents(paragraph_tuples)
-            docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
-                                  gold_preproc, noise_level=noise_level)
-            golds = cls._make_golds(docs, paragraph_tuples)
-            for doc, gold in zip(docs, golds):
-                if (not max_length) or len(doc) < max_length:
-                    yield doc, gold
-
-    @classmethod
-    def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc,
-                   noise_level=0.0):
-        if raw_text is not None:
-            raw_text = add_noise(raw_text, noise_level)
-            return [nlp.make_doc(raw_text)]
-        else:
-            return [Doc(nlp.vocab,
-                        words=add_noise(sent_tuples[1], noise_level))
-                    for (sent_tuples, brackets) in paragraph_tuples]
-
-    @classmethod
-    def _make_golds(cls, docs, paragraph_tuples):
-        assert len(docs) == len(paragraph_tuples)
-        if len(docs) == 1:
-            return [GoldParse.from_annot_tuples(docs[0],
-                                                paragraph_tuples[0][0])]
-        else:
-            return [GoldParse.from_annot_tuples(doc, sent_tuples)
-                    for doc, (sent_tuples, brackets)
-                    in zip(docs, paragraph_tuples)]
+    def __del__(self):
+        shutil.rmtree(self.tmp_dir)

+    @staticmethod
+    def write_msgpack(directory, doc_tuples):
+        if not directory.exists():
+            directory.mkdir()
+        for i, doc_tuple in enumerate(doc_tuples):
+            with open(directory / '{}.msg'.format(i), 'wb') as file_:
+                msgpack.dump([doc_tuple], file_, use_bin_type=True, encoding='utf8')
+    
    @staticmethod
    def walk_corpus(path):
+        path = util.ensure_path(path)
        if not path.is_dir():
            return [path]
        paths = [path]
@ -266,6 +141,101 @@ class GoldCorpus(object):
                locs.append(path)
        return locs

+    @staticmethod
+    def read_tuples(locs, limit=0):
+        i = 0
+        for loc in locs:
+            loc = util.ensure_path(loc)
+            if loc.parts[-1].endswith('json'):
+                gold_tuples = read_json_file(loc)
+            elif loc.parts[-1].endswith('msg'):
+                with loc.open('rb') as file_:
+                    gold_tuples = msgpack.load(file_, encoding='utf8')
+            else:
+                msg = "Cannot read from file: %s. Supported formats: .json, .msg"
+                raise ValueError(msg % loc)
+            for item in gold_tuples:
+                yield item
+                i += len(item[1])
+                if limit and i >= limit:
+                    break
+
+    @property
+    def dev_tuples(self):
+        locs = (self.tmp_dir / 'dev').iterdir()
+        yield from self.read_tuples(locs, limit=self.limit)
+   
+    @property
+    def train_tuples(self):
+        locs = (self.tmp_dir / 'train').iterdir()
+        yield from self.read_tuples(locs, limit=self.limit)
+
+    def count_train(self):
+        n = 0
+        i = 0
+        for raw_text, paragraph_tuples in self.train_tuples:
+            for sent_tuples, brackets in paragraph_tuples:
+                n += len(sent_tuples[1])
+            if self.limit and i >= self.limit:
+                break
+            i += len(paragraph_tuples)
+        return n
+
+    def train_docs(self, nlp, gold_preproc=False, max_length=None,
+                    noise_level=0.0):
+        locs = list((self.tmp_dir / 'train').iterdir())
+        random.shuffle(locs)
+        train_tuples = self.read_tuples(locs, limit=self.limit)
+        gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
+                                        max_length=max_length,
+                                        noise_level=noise_level,
+                                        make_projective=True)
+        yield from gold_docs
+
+    def dev_docs(self, nlp, gold_preproc=False):
+        gold_docs = self.iter_gold_docs(nlp, self.dev_tuples,
+                                        gold_preproc=gold_preproc)
+        yield from gold_docs
+
+    @classmethod
+    def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
+                       noise_level=0.0, make_projective=False):
+        for raw_text, paragraph_tuples in tuples:
+            if gold_preproc:
+                raw_text = None
+            else:
+                paragraph_tuples = merge_sents(paragraph_tuples)
+            docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
+                                  gold_preproc, noise_level=noise_level)
+            golds = cls._make_golds(docs, paragraph_tuples, make_projective)
+            for doc, gold in zip(docs, golds):
+                if (not max_length) or len(doc) < max_length:
+                    yield doc, gold
+
+    @classmethod
+    def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc,
+                   noise_level=0.0):
+        if raw_text is not None:
+            raw_text = add_noise(raw_text, noise_level)
+            return [nlp.make_doc(raw_text)]
+        else:
+            return [Doc(nlp.vocab,
+                        words=add_noise(sent_tuples[1], noise_level))
+                    for (sent_tuples, brackets) in paragraph_tuples]
+
+    @classmethod
+    def _make_golds(cls, docs, paragraph_tuples, make_projective):
+        assert len(docs) == len(paragraph_tuples)
+        if len(docs) == 1:
+            return [GoldParse.from_annot_tuples(docs[0],
+                                                paragraph_tuples[0][0],
+                                                make_projective=make_projective)]
+        else:
+            return [GoldParse.from_annot_tuples(doc, sent_tuples,
+                                                make_projective=make_projective)
+                    for doc, (sent_tuples, brackets)
+                    in zip(docs, paragraph_tuples)]
+

 def add_noise(orig, noise_level):
    if random.random() >= noise_level:
@ -297,11 +267,7 @@ def read_json_file(loc, docs_filter=None, limit=None):
        for filename in loc.iterdir():
            yield from read_json_file(loc / filename, limit=limit)
    else:
-        with loc.open('r', encoding='utf8') as file_:
-            docs = ujson.load(file_)
-        if limit is not None:
-            docs = docs[:limit]
-        for doc in docs:
+        for doc in _json_iterate(loc):
            if docs_filter is not None and not docs_filter(doc):
                continue
            paragraphs = []
@ -331,6 +297,56 @@ def read_json_file(loc, docs_filter=None, limit=None):
                    yield [paragraph.get('raw', None), sents]


+def _json_iterate(loc):
+    # We should've made these files jsonl...But since we didn't, parse out
+    # the docs one-by-one to reduce memory usage.
+    # It's okay to read in the whole file -- just don't parse it into JSON.
+    cdef bytes py_raw
+    loc = util.ensure_path(loc)
+    with loc.open('rb') as file_:
+        py_raw = file_.read()
+    raw = <char*>py_raw
+    cdef int square_depth = 0
+    cdef int curly_depth = 0
+    cdef int inside_string = 0
+    cdef int escape = 0
+    cdef int start = -1
+    cdef char c
+    cdef char quote = ord('"')
+    cdef char backslash = ord('\\')
+    cdef char open_square = ord('[')
+    cdef char close_square = ord(']')
+    cdef char open_curly = ord('{')
+    cdef char close_curly = ord('}')
+    for i in range(len(py_raw)):
+        c = raw[i]
+        if c == backslash:
+            escape = True
+            continue
+        if escape:
+            escape = False
+            continue
+        if c == quote:
+            inside_string = not inside_string
+            continue
+        if inside_string:
+            continue
+        if c == open_square:
+            square_depth += 1
+        elif c == close_square:
+            square_depth -= 1
+        elif c == open_curly:
+            if square_depth == 1 and curly_depth == 0:
+                start = i
+            curly_depth += 1
+        elif c == close_curly:
+            curly_depth -= 1
+            if square_depth == 1 and curly_depth == 0:
+                py_str = py_raw[start : i+1].decode('utf8')
+                yield ujson.loads(py_str)
+                start = -1
+
+
 def iob_to_biluo(tags):
    out = []
    curr_label = None
@ -434,8 +450,21 @@ cdef class GoldParse:
        self.labels = [None] * len(doc)
        self.ner = [None] * len(doc)

-        self.cand_to_gold = align([t.orth_ for t in doc], words)
-        self.gold_to_cand = align(words, [t.orth_ for t in doc])
+        # This needs to be done before we align the words
+        if make_projective and heads is not None and deps is not None:
+            heads, deps = nonproj.projectivize(heads, deps)
+
+        # Do many-to-one alignment for misaligned tokens.
+        # If we over-segment, we'll have one gold word that covers a sequence
+        # of predicted words
+        # If we under-segment, we'll have one predicted word that covers a
+        # sequence of gold words.
+        # If we "mis-segment", we'll have a sequence of predicted words covering
+        # a sequence of gold words. That's many-to-many -- we don't do that.
+        cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
+
+        self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
+        self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]

        annot_tuples = (range(len(words)), words, tags, heads, deps, entities)
        self.orig_annot = list(zip(*annot_tuples))
@ -443,12 +472,47 @@ cdef class GoldParse:
        for i, gold_i in enumerate(self.cand_to_gold):
            if doc[i].text.isspace():
                self.words[i] = doc[i].text
-                self.tags[i] = 'SP'
+                self.tags[i] = '_SP'
                self.heads[i] = None
                self.labels[i] = None
                self.ner[i] = 'O'
            if gold_i is None:
-                pass
+                if i in i2j_multi:
+                    self.words[i] = words[i2j_multi[i]]
+                    self.tags[i] = tags[i2j_multi[i]]
+                    is_last = i2j_multi[i] != i2j_multi.get(i+1)
+                    is_first = i2j_multi[i] != i2j_multi.get(i-1)
+                    # Set next word in multi-token span as head, until last
+                    if not is_last:
+                        self.heads[i] = i+1
+                        self.labels[i] = 'subtok'
+                    else:
+                        self.heads[i] = self.gold_to_cand[heads[i2j_multi[i]]]
+                        self.labels[i] = deps[i2j_multi[i]]
+                    # Now set NER...This is annoying because if we've split
+                    # got an entity word split into two, we need to adjust the
+                    # BILOU tags. We can't have BB or LL etc.
+                    # Case 1: O -- easy.
+                    ner_tag = entities[i2j_multi[i]]
+                    if ner_tag == 'O':
+                        self.ner[i] = 'O'
+                    # Case 2: U. This has to become a B I* L sequence.
+                    elif ner_tag.startswith('U-'):
+                        if is_first:
+                            self.ner[i] = ner_tag.replace('U-', 'B-', 1)
+                        elif is_last:
+                            self.ner[i] = ner_tag.replace('U-', 'L-', 1)
+                        else:
+                            self.ner[i] = ner_tag.replace('U-', 'I-', 1)
+                    # Case 3: L. If not last, change to I.
+                    elif ner_tag.startswith('L-'):
+                        if is_last:
+                            self.ner[i] = ner_tag
+                        else:
+                            self.ner[i] = ner_tag.replace('L-', 'I-', 1)
+                    # Case 4: I. Stays correct
+                    elif ner_tag.startswith('I-'):
+                        self.ner[i] = ner_tag
            else:
                self.words[i] = words[gold_i]
                self.tags[i] = tags[gold_i]
@ -463,10 +527,6 @@ cdef class GoldParse:
        if cycle is not None:
            raise Exception("Cycle found: %s" % cycle)

-        if make_projective:
-            proj_heads, _ = nonproj.projectivize(self.heads, self.labels)
-            self.heads = proj_heads
-
    def __len__(self):
        """Get the number of gold-standard tokens.

--- a/spacy/lang/en/stop_words.py
+++ b/spacy/lang/en/stop_words.py
@ -39,7 +39,7 @@ made make many may me meanwhile might mine more moreover most mostly move much
 must my myself

 name namely neither never nevertheless next nine no nobody none noone nor not
-nothing now nowhere
+nothing now nowhere n't

 of off often on once one only onto or other others otherwise our ours ourselves
 out over own
@ -66,4 +66,6 @@ whereafter whereas whereby wherein whereupon wherever whether which while
 whither who whoever whole whom whose why will with within without would

 yet you your yours yourself yourselves
+
+'d 'll 'm 're 's 've
 """.split())
--- a/spacy/lang/es/syntax_iterators.py
+++ b/spacy/lang/es/syntax_iterators.py
@ -6,17 +6,19 @@ from ...symbols import NOUN, PROPN, PRON, VERB, AUX

 def noun_chunks(obj):
    doc = obj.doc
-    np_label = doc.vocab.strings['NP']
+    if not len(doc):
+        return
+    np_label = doc.vocab.strings.add('NP')
    left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed']
    right_labels = ['flat', 'fixed', 'compound', 'neg']
    stop_labels = ['punct']
-    np_left_deps = [doc.vocab.strings[label] for label in left_labels]
-    np_right_deps = [doc.vocab.strings[label] for label in right_labels]
-    stop_deps = [doc.vocab.strings[label] for label in stop_labels]
+    np_left_deps = [doc.vocab.strings.add(label) for label in left_labels]
+    np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
+    stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
    token = doc[0]
    while token and token.i < len(doc):
        if token.pos in [PROPN, NOUN, PRON]:
-            left, right = noun_bounds(token)
+            left, right = noun_bounds(doc, token, np_left_deps, np_right_deps, stop_deps)
            yield left.i, right.i+1, np_label
            token = right
        token = next_token(token)
@ -33,7 +35,7 @@ def next_token(token):
        return None


-def noun_bounds(root):
+def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps):
    left_bound = root
    for token in reversed(list(root.lefts)):
        if token.dep in np_left_deps:
@ -41,7 +43,7 @@ def noun_bounds(root):
    right_bound = root
    for token in root.rights:
        if (token.dep in np_right_deps):
-            left, right = noun_bounds(token)
+            left, right = noun_bounds(doc, token, np_left_deps, np_right_deps, stop_deps)
            if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps,
                           doc[left_bound.i: right.i])):
                break
--- a/spacy/lang/fi/examples.py
+++ b/spacy/lang/fi/examples.py
@ -0,0 +1,15 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+"""
+Example sentences to test spaCy and its language models.
+>>> from spacy.lang.fi.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+sentences = [
+    "Apple harkitsee ostavansa startup-yrityksen UK:sta 1 miljardilla dollarilla.",
+    "Itseajavat autot siirtävät vakuutusriskin valmistajille.",
+    "San Francisco harkitsee jakelurobottien kieltämistä jalkakäytävillä.",
+    "Lontoo on iso kaupunki Iso-Britanniassa."
+]
--- a/spacy/lang/fi/lex_attrs.py
+++ b/spacy/lang/fi/lex_attrs.py
@ -0,0 +1,26 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+# import the symbols for the attrs you want to overwrite
+from ...attrs import LIKE_NUM
+
+# check if token resembles a number
+
+_num_words = ['nolla', 'yksi', 'kaksi', 'kolme', 'neljä', 'viisi', 'kuusi', 'seitsemän', 'kahdeksan', 'yhdeksän', 'kymmenen', 'yksitoista', 'kaksitoista', 'kolmetoista' 'neljätoista', 'viisitoista', 'kuusitoista', 'seitsemäntoista', 'kahdeksantoista', 'yhdeksäntoista', 'kaksikymmentä', 'kolmekymmentä', 'neljäkymmentä', 'viisikymmentä', 'kuusikymmentä'v, 'seitsemänkymmentä', 'kahdeksankymmentä', 'yhdeksänkymmentä', 'sata', 'tuhat', 'miljoona', 'miljardi', 'triljoona']
+
+
+def like_num(text):
+    text = text.replace('.', '').replace(',', '')
+    if text.isdigit():
+        return True
+    if text.count('/') == 1:
+        num, denom = text.split('/')
+        if num.isdigit() and denom.isdigit():
+            return True
+    if text in _num_words:
+        return True
+    return False
+
+LEX_ATTRS = {
+    LIKE_NUM: like_num
+}
--- a/spacy/lang/fi/stop_words.py
+++ b/spacy/lang/fi/stop_words.py
@ -79,7 +79,7 @@ pienestä pieni pienin poikki puolesta puolestaan päälle

 runsaasti

-saakka sama samaa samaan samalla saman samat samoin sata sataa satojen se
+saakka sama samaa samaan samalla saman samat samoin satojen se
 seitsemän sekä sen seuraavat siellä sieltä siihen siinä siis siitä sijaan siksi
 sille silloin sillä silti siltä sinne sinua sinulla sinulle sinulta sinun
 sinussa sinusta sinut sinuun sinä sisäkkäin sisällä siten sitten sitä ssa sta
@ -89,7 +89,7 @@ taa taas taemmas tahansa tai takaa takaisin takana takia tallä tapauksessa
 tarpeeksi tavalla tavoitteena te teidän teidät teihin teille teillä teiltä
 teissä teistä teitä tietysti todella toinen toisaalla toisaalle toisaalta
 toiseen toiseksi toisella toiselle toiselta toisemme toisen toisensa toisessa
-toisesta toista toistaiseksi toki tosin tuhannen tuhat tule tulee tulemme tulen
+toisesta toista toistaiseksi toki tosin tule tulee tulemme tulen
 tulet tulette tulevat tulimme tulin tulisi tulisimme tulisin tulisit tulisitte
 tulisivat tulit tulitte tulivat tulla tulleet tullut tuntuu tuo tuohon tuoksi
 tuolla tuolle tuolloin tuolta tuon tuona tuonne tuossa tuosta tuota tuskin tykö
--- a/spacy/lang/ja/init.py
+++ b/spacy/lang/ja/init.py
@ -35,14 +35,32 @@ class JapaneseTokenizer(object):
    def from_disk(self, path, **exclude):
        return self

+class JapaneseCharacterSegmenter(object):
+    def __init__(self, vocab):
+        self.vocab = vocab
+
+    def __call__(self, text):
+        words = []
+        spaces = []
+        doc = self.tokenizer(text)
+        for token in self.tokenizer(text):
+            words.extend(list(token.text))
+            spaces.extend([False]*len(token.text))
+            spaces[-1] = bool(token.whitespace_)
+        return Doc(self.vocab, words=words, spaces=spaces)
+

 class JapaneseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'ja'
+    use_janome = True

    @classmethod
    def create_tokenizer(cls, nlp=None):
-        return JapaneseTokenizer(cls, nlp)
+        if cls.use_janome:
+            return JapaneseTokenizer(cls, nlp)
+        else:
+            return JapaneseCharacterSegmenter(cls, nlp.vocab)


 class Japanese(Language):
--- a/spacy/lang/lex_attrs.py
+++ b/spacy/lang/lex_attrs.py
@ -144,7 +144,7 @@ def is_lower(string): return string.islower()
 def is_space(string): return string.isspace()
 def is_title(string): return string.istitle()
 def is_upper(string): return string.isupper()
-def is_stop(string, stops=set()): return string in stops
+def is_stop(string, stops=set()): return string.lower() in stops
 def is_oov(string): return True
 def get_prob(string): return -20.

--- a/spacy/lang/pl/init.py
+++ b/spacy/lang/pl/init.py
@ -2,6 +2,7 @@
 from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
@ -17,6 +18,7 @@ class PolishDefaults(Language.Defaults):
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
+    tag_map = TAG_MAP


 class Polish(Language):
--- a/spacy/lang/pl/tag_map.py
+++ b/spacy/lang/pl/tag_map.py
--- a/spacy/lang/pl/tokenizer_exceptions.py
+++ b/spacy/lang/pl/tokenizer_exceptions.py
@ -1,7 +1,7 @@
 # encoding: utf8
 from __future__ import unicode_literals

-from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN, ADP
+from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN


 _exc = {}
@ -12,24 +12,11 @@ for exc_data in [
    {ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
    {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
    {ORTH: "tj.", LEMMA: "to jest", POS: ADV},
-    {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ},
-    {ORTH: "adw.", LEMMA: "adwokat", POS: NOUN},
-    {ORTH: "afr.", LEMMA: "afrykański", POS: ADJ},
-    {ORTH: "c.b.d.o.", LEMMA: "co było do okazania", POS: ADV},
-    {ORTH: "cbdu.", LEMMA: "co było do udowodnienia", POS: ADV},
-    {ORTH: "mn.w.", LEMMA: "mniej więcej", POS: ADV},
-    {ORTH: "nt.", LEMMA: "na temat", POS: ADP},
-    {ORTH: "ok.", LEMMA: "około"},
-    {ORTH: "n.p.u.", LEMMA: "na psa urok"},
-    {ORTH: "ww.", LEMMA: "wyżej wymieniony", POS: ADV}]:
+    {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
    _exc[exc_data[ORTH]] = [exc_data]

 for orth in [
-    "w.", "r.", "br.", "bm.", "b.r.", "amer.", "am.", "bdb.", "św.", "p.", "lit.",
-    "wym.", "czyt.", "daw.", "d.", "zob.", "gw.", "dn.", "dyr.", "im.", "mł.",
-    "min.", "dot.", "muz.", "k.k.", "k.p.a.", "k.p.c.", "n.p.m.", "p.p.m.", "nb.",
-    "ob.", "n.e.", "p.n.e.", "zw.", "zool.", "zach.", "żarg.", "żart.", "wzgl.",
-    "wyj.", "xx.", "ks.", "x.", "wyd.", "wsch.", "o.o."]:
+    "w.", "r."]:
    _exc[orth] = [{ORTH: orth}]


--- a/spacy/lang/tag_map.py
+++ b/spacy/lang/tag_map.py
@ -24,5 +24,5 @@ TAG_MAP = {
    "ADJ":      {POS: ADJ},
    "VERB":     {POS: VERB},
    "PART":     {POS: PART},
-    "SP":     	{POS: SPACE}
+    "_SP":     	{POS: SPACE}
 }
--- a/spacy/lang/vi/init.py
+++ b/spacy/lang/vi/init.py
@ -0,0 +1,19 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...attrs import LANG
+from ...language import Language
+from ...tokens import Doc
+
+
+class VietnameseDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'vi'  # for pickling
+
+
+class Vietnamese(Language):
+    lang = 'vi'
+    Defaults = VietnameseDefaults  # override defaults
+
+
+__all__ = ['Vietnamese']
--- a/spacy/lang/zh/init.py
+++ b/spacy/lang/zh/init.py
@ -9,6 +9,7 @@ from ...tokens import Doc
 class ChineseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'zh'  # for pickling
+    use_jieba = True


 class Chinese(Language):
@ -16,14 +17,25 @@ class Chinese(Language):
    Defaults = ChineseDefaults  # override defaults

    def make_doc(self, text):
-        try:
-            import jieba
-        except ImportError:
-            raise ImportError("The Chinese tokenizer requires the Jieba library: "
-                              "https://github.com/fxsjy/jieba")
-        words = list(jieba.cut(text, cut_all=False))
-        words = [x for x in words if x]
-        return Doc(self.vocab, words=words, spaces=[False]*len(words))
+        if self.Defaults.use_jieba:
+            try:
+                import jieba
+            except ImportError:
+                msg = ("Jieba not installed. Either set Chinese.use_jieba = False, "
+                       "or install it https://github.com/fxsjy/jieba")
+                raise ImportError(msg)
+            words = list(jieba.cut(text, cut_all=False))
+            words = [x for x in words if x]
+            return Doc(self.vocab, words=words, spaces=[False]*len(words))
+        else:
+            words = []
+            spaces = []
+            doc = self.tokenizer(text)
+            for token in self.tokenizer(text):
+                words.extend(list(token.text))
+                spaces.extend([False]*len(token.text))
+                spaces[-1] = bool(token.whitespace_)
+            return Doc(self.vocab, words=words, spaces=spaces)


 __all__ = ['Chinese']
--- a/spacy/language.py
+++ b/spacy/language.py
@ -17,7 +17,7 @@ from .vocab import Vocab
 from .lemmatizer import Lemmatizer
 from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
 from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
-from .pipeline import merge_noun_chunks, merge_entities
+from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
 from .compat import json_dumps, izip, basestring_
 from .gold import GoldParse
 from .scorer import Scorer
@ -108,7 +108,8 @@ class Language(object):
        'sbd': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
        'sentencizer': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
        'merge_noun_chunks': lambda nlp, **cfg: merge_noun_chunks,
-        'merge_entities': lambda nlp, **cfg: merge_entities
+        'merge_entities': lambda nlp, **cfg: merge_entities,
+        'merge_subtokens': lambda nlp, **cfg: merge_subtokens,
    }

    def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs):
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

-from .symbols import POS, NOUN, VERB, ADJ, PUNCT
+from .symbols import POS, NOUN, VERB, ADJ, PUNCT, PROPN
 from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos


@ -27,11 +27,13 @@ class Lemmatizer(object):
            univ_pos = 'adj'
        elif univ_pos in (PUNCT, 'PUNCT', 'punct'):
            univ_pos = 'punct'
+        elif univ_pos in (PROPN, 'PROPN'):
+            return [string]
        else:
-            return list(set([string.lower()]))
+            return [string.lower()]
        # See Issue #435 for example of where this logic is requied.
        if self.is_base_form(univ_pos, morphology):
-            return list(set([string.lower()]))
+            return [string.lower()]
        lemmas = lemmatize(string, self.index.get(univ_pos, {}),
                           self.exc.get(univ_pos, {}),
                           self.rules.get(univ_pos, []))
@ -88,6 +90,7 @@ class Lemmatizer(object):


 def lemmatize(string, index, exceptions, rules):
+    orig = string
    string = string.lower()
    forms = []
    forms.extend(exceptions.get(string, []))
@ -105,5 +108,5 @@ def lemmatize(string, index, exceptions, rules):
    if not forms:
        forms.extend(oov_forms)
    if not forms:
-        forms.append(string)
+        forms.append(orig)
    return list(set(forms))
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@ -1,24 +1,19 @@
-# cython: profile=True
 # cython: infer_types=True
-# coding: utf8
+# cython: profile=True
 from __future__ import unicode_literals
-
-import ujson
-from cymem.cymem cimport Pool
-from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
-from libcpp.pair cimport pair
+from libc.stdint cimport int32_t, uint64_t, uint16_t
+from preshed.maps cimport PreshMap
+from cymem.cymem cimport Pool
 from murmurhash.mrmr cimport hash64
-from libc.stdint cimport int32_t
-
-from .typedefs cimport attr_t
-from .typedefs cimport hash_t
+from .typedefs cimport attr_t, hash_t
 from .structs cimport TokenC
-from .tokens.doc cimport Doc, get_token_attr
+from .lexeme cimport attr_id_t
 from .vocab cimport Vocab
-
+from .tokens.doc cimport Doc
+from .tokens.doc cimport get_token_attr
+from .attrs cimport ID, attr_id_t, NULL_ATTR
 from .attrs import IDS
-from .attrs cimport attr_id_t, ID, NULL_ATTR
 from .attrs import FLAG61 as U_ENT
 from .attrs import FLAG60 as B2_ENT
 from .attrs import FLAG59 as B3_ENT
@ -48,29 +43,24 @@ from .attrs import FLAG36 as L9_ENT
 from .attrs import FLAG35 as L10_ENT


-cpdef enum quantifier_t:
-    _META
-    ONE
+cdef enum action_t:
+    REJECT = 0000
+    MATCH = 1000
+    ADVANCE = 0100
+    RETRY = 0010
+    RETRY_EXTEND = 0011
+    MATCH_EXTEND = 1001
+    MATCH_REJECT = 2000
+
+
+cdef enum quantifier_t:
    ZERO
    ZERO_ONE
    ZERO_PLUS
+    ONE
+    ONE_PLUS


-cdef enum action_t:
-    REJECT
-    ADVANCE
-    REPEAT
-    ACCEPT
-    ADVANCE_ZERO
-    ACCEPT_PREV
-    PANIC
-
-# A "match expression" conists of one or more token patterns
-# Each token pattern consists of a quantifier and 0+ (attr, value) pairs.
-# A state is an (int, pattern pointer) pair, where the int is the start
-# position, and the pattern pointer shows where we're up to
-# in the pattern.
-
 cdef struct AttrValueC:
    attr_id_t attr
    attr_t value
@ -80,10 +70,231 @@ cdef struct TokenPatternC:
    AttrValueC* attrs
    int32_t nr_attr
    quantifier_t quantifier
+    hash_t key


-ctypedef TokenPatternC* TokenPatternC_ptr
-ctypedef pair[int, TokenPatternC_ptr] StateC
+cdef struct ActionC:
+    char emit_match
+    char next_state_next_token
+    char next_state_same_token
+    char same_state_next_token
+
+
+cdef struct PatternStateC:
+    TokenPatternC* pattern
+    int32_t start
+    int32_t length
+
+
+cdef struct MatchC:
+    attr_t pattern_id
+    int32_t start
+    int32_t length
+
+
+cdef find_matches(TokenPatternC** patterns, int n, Doc doc):
+    cdef vector[PatternStateC] states
+    cdef vector[MatchC] matches
+    cdef PatternStateC state
+    cdef Pool mem = Pool()
+    # TODO: Prefill this with the extra attribute values.
+    extra_attrs = <attr_t**>mem.alloc(len(doc), sizeof(attr_t*))
+    # Main loop
+    cdef int i, j
+    for i in range(doc.length):
+        for j in range(n):
+            states.push_back(PatternStateC(patterns[j], i, 0))
+        transition_states(states, matches, &doc.c[i], extra_attrs[i])
+    # Handle matches that end in 0-width patterns
+    finish_states(matches, states)
+    return [(matches[i].pattern_id, matches[i].start, matches[i].start+matches[i].length)
+            for i in range(matches.size())]
+
+
+
+cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
+        const TokenC* token, const attr_t* extra_attrs) except *:
+    cdef int q = 0
+    cdef vector[PatternStateC] new_states
+    for i in range(states.size()):
+        action = get_action(states[i], token, extra_attrs)
+        if action == REJECT:
+            continue
+        state = states[i]
+        states[q] = state
+        while action in (RETRY, RETRY_EXTEND):
+            if action == RETRY_EXTEND:
+                new_states.push_back(
+                    PatternStateC(pattern=state.pattern, start=state.start,
+                                  length=state.length+1))
+            states[q].pattern += 1
+            action = get_action(states[q], token, extra_attrs)
+        if action == REJECT:
+            pass
+        elif action == ADVANCE:
+            states[q].pattern += 1
+            states[q].length += 1
+            q += 1
+        else:
+            ent_id = state.pattern[1].attrs.value
+            if action == MATCH:
+                matches.push_back(
+                    MatchC(pattern_id=ent_id, start=state.start,
+                            length=state.length+1))
+            elif action == MATCH_REJECT:
+                matches.push_back(
+                    MatchC(pattern_id=ent_id, start=state.start,
+                            length=state.length))
+            elif action == MATCH_EXTEND:
+                matches.push_back(
+                    MatchC(pattern_id=ent_id, start=state.start,
+                           length=state.length))
+                states[q].length += 1
+                q += 1
+    states.resize(q)
+    for i in range(new_states.size()):
+        states.push_back(new_states[i])
+
+
+cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states) except *:
+    '''Handle states that end in zero-width patterns.'''
+    cdef PatternStateC state
+    for i in range(states.size()):
+        state = states[i]
+        while get_quantifier(state) in (ZERO_PLUS, ZERO_ONE):
+            is_final = get_is_final(state)
+            if is_final:
+                ent_id = state.pattern[1].attrs.value
+                matches.push_back(
+                    MatchC(pattern_id=ent_id, start=state.start, length=state.length))
+                break
+            else:
+                state.pattern += 1
+
+
+cdef action_t get_action(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil:
+    '''We need to consider:
+
+    a) Does the token match the specification? [Yes, No]
+    b) What's the quantifier? [1, 0+, ?]
+    c) Is this the last specification? [final, non-final]
+
+    We can transition in the following ways:
+
+    a) Do we emit a match?
+    b) Do we add a state with (next state, next token)?
+    c) Do we add a state with (next state, same token)?
+    d) Do we add a state with (same state, next token)?
+
+    We'll code the actions as boolean strings, so 0000 means no to all 4,
+    1000 means match but no states added, etc.
+    
+    1:
+      Yes, final:
+        1000
+      Yes, non-final:
+        0100
+      No, final:
+        0000
+      No, non-final
+        0000
+    0+:
+      Yes, final:
+        1001
+      Yes, non-final:
+        0011
+      No, final:
+        1000 (note: Don't include last token!)
+      No, non-final:
+        0010
+    ?:
+      Yes, final:
+        1000
+      Yes, non-final:
+        0100
+      No, final:
+        1000 (note: Don't include last token!)
+      No, non-final:
+        0010
+
+    Possible combinations:  1000, 0100, 0000, 1001, 0011, 0010, 
+    
+    We'll name the bits "match", "advance", "retry", "extend"
+    REJECT = 0000
+    MATCH = 1000
+    ADVANCE = 0100
+    RETRY = 0010
+    MATCH_EXTEND = 1001
+    RETRY_EXTEND = 0011
+    MATCH_REJECT = 2000 # Match, but don't include last token
+
+    Problem: If a quantifier is matching, we're adding a lot of open partials
+    '''
+    cdef char is_match
+    is_match = get_is_match(state, token, extra_attrs)
+    quantifier = get_quantifier(state)
+    is_final = get_is_final(state)
+    if quantifier == ZERO:
+        is_match = not is_match
+        quantifier = ONE
+    if quantifier == ONE:
+      if is_match and is_final:
+          # Yes, final: 1000
+          return MATCH
+      elif is_match and not is_final:
+          # Yes, non-final: 0100
+          return ADVANCE
+      elif not is_match and is_final:
+          # No, final: 0000
+          return REJECT
+      else:
+          return REJECT
+    elif quantifier == ZERO_PLUS:
+      if is_match and is_final:
+          # Yes, final: 1001
+          return MATCH_EXTEND
+      elif is_match and not is_final:
+          # Yes, non-final: 0011
+          return RETRY_EXTEND
+      elif not is_match and is_final:
+          # No, final 2000 (note: Don't include last token!)
+          return MATCH_REJECT
+      else:
+          # No, non-final 0010
+          return RETRY
+    elif quantifier == ZERO_ONE:
+      if is_match and is_final:
+          # Yes, final: 1000
+          return MATCH
+      elif is_match and not is_final:
+          # Yes, non-final: 0100
+          return ADVANCE
+      elif not is_match and is_final:
+          # No, final 2000 (note: Don't include last token!)
+          return MATCH_REJECT
+      else:
+          # No, non-final 0010
+          return RETRY
+
+
+cdef char get_is_match(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil:
+    spec = state.pattern
+    for attr in spec.attrs[:spec.nr_attr]:
+        if get_token_attr(token, attr.attr) != attr.value:
+            return 0
+    else:
+        return 1
+
+
+cdef char get_is_final(PatternStateC state) nogil:
+    if state.pattern[1].attrs[0].attr == ID and state.pattern[1].nr_attr == 0:
+        return 1
+    else:
+        return 0
+
+
+cdef char get_quantifier(PatternStateC state) nogil:
+    return state.pattern.quantifier


 cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
@ -97,6 +308,7 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
        for j, (attr, value) in enumerate(spec):
            pattern[i].attrs[j].attr = attr
            pattern[i].attrs[j].value = value
+        pattern[i].key = hash64(pattern[i].attrs, pattern[i].nr_attr * sizeof(AttrValueC), 0)
    i = len(token_specs)
    pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC))
    pattern[i].attrs[0].attr = ID
@ -105,48 +317,16 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
    return pattern


-cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0:
+cdef attr_t get_pattern_key(const TokenPatternC* pattern) nogil:
    while pattern.nr_attr != 0:
        pattern += 1
    id_attr = pattern[0].attrs[0]
-    assert id_attr.attr == ID
    return id_attr.value

-
-cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
-    lookahead = &pattern[1]
-    for attr in pattern.attrs[:pattern.nr_attr]:
-        if get_token_attr(token, attr.attr) != attr.value:
-            if pattern.quantifier == ONE:
-                return REJECT
-            elif pattern.quantifier == ZERO:
-                return ACCEPT if lookahead.nr_attr == 0 else ADVANCE
-            elif pattern.quantifier in (ZERO_ONE, ZERO_PLUS):
-                return ACCEPT_PREV if lookahead.nr_attr == 0 else ADVANCE_ZERO
-            else:
-                return PANIC
-    if pattern.quantifier == ZERO:
-        return REJECT
-    elif lookahead.nr_attr == 0:
-        return ACCEPT
-    elif pattern.quantifier in (ONE, ZERO_ONE):
-        return ADVANCE
-    elif pattern.quantifier == ZERO_PLUS:
-        # This is a bandaid over the 'shadowing' problem described here:
-        # https://github.com/explosion/spaCy/issues/864
-        next_action = get_action(lookahead, token)
-        if next_action is REJECT:
-            return REPEAT
-        else:
-            return ADVANCE_ZERO
-    else:
-        return PANIC
-
-
 def _convert_strings(token_specs, string_store):
    # Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
-    operators = {'!': (ZERO,), '*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
-                 '?': (ZERO_ONE,), '1': (ONE,)}
+    operators = {'*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
+                 '?': (ZERO_ONE,), '1': (ONE,), '!': (ZERO,)}
    tokens = []
    op = ONE
    for spec in token_specs:
@ -176,21 +356,6 @@ def _convert_strings(token_specs, string_store):
    return tokens


-def merge_phrase(matcher, doc, i, matches):
-    """Callback to merge a phrase on match."""
-    ent_id, label, start, end = matches[i]
-    span = doc[start:end]
-    span.merge(ent_type=label, ent_id=ent_id)
-
-
-def unpickle_matcher(vocab, patterns, callbacks):
-    matcher = Matcher(vocab)
-    for key, specs in patterns.items():
-        callback = callbacks.get(key, None)
-        matcher.add(key, callback, *specs)
-    return matcher
-
-
 cdef class Matcher:
    """Match sequences of tokens, based on pattern rules."""
    cdef Pool mem
@ -311,7 +476,7 @@ cdef class Matcher:
        if key not in self._patterns:
            return default
        return (self._callbacks[key], self._patterns[key])
-
+    
    def pipe(self, docs, batch_size=1000, n_threads=2):
        """Match a stream of documents, yielding them in turn.

@ -333,85 +498,9 @@ cdef class Matcher:
            describing the matches. A match tuple describes a span
            `doc[start:end]`. The `label_id` and `key` are both integers.
        """
-        cdef vector[StateC] partials
-        cdef int n_partials = 0
-        cdef int q = 0
-        cdef int i, token_i
-        cdef const TokenC* token
-        cdef StateC state
-        matches = []
-        for token_i in range(doc.length):
-            token = &doc.c[token_i]
-            q = 0
-            # Go over the open matches, extending or finalizing if able.
-            # Otherwise, we over-write them (q doesn't advance)
-            for state in partials:
-                action = get_action(state.second, token)
-                if action == PANIC:
-                    raise Exception("Error selecting action in matcher")
-                while action == ADVANCE_ZERO:
-                    state.second += 1
-                    action = get_action(state.second, token)
-                if action == PANIC:
-                    raise Exception("Error selecting action in matcher")
-
-                if action == REPEAT:
-                    # Leave the state in the queue, and advance to next slot
-                    # (i.e. we don't overwrite -- we want to greedily match
-                    # more pattern.
-                    q += 1
-                elif action == REJECT:
-                    pass
-                elif action == ADVANCE:
-                    partials[q] = state
-                    partials[q].second += 1
-                    q += 1
-                elif action in (ACCEPT, ACCEPT_PREV):
-                    # TODO: What to do about patterns starting with ZERO? Need
-                    # to adjust the start position.
-                    start = state.first
-                    end = token_i+1 if action == ACCEPT else token_i
-                    ent_id = state.second[1].attrs[0].value
-                    label = state.second[1].attrs[1].value
-                    matches.append((ent_id, start, end))
-
-            partials.resize(q)
-            # Check whether we open any new patterns on this token
-            for pattern in self.patterns:
-                action = get_action(pattern, token)
-                if action == PANIC:
-                    raise Exception("Error selecting action in matcher")
-                while action == ADVANCE_ZERO:
-                    pattern += 1
-                    action = get_action(pattern, token)
-                if action == REPEAT:
-                    state.first = token_i
-                    state.second = pattern
-                    partials.push_back(state)
-                elif action == ADVANCE:
-                    # TODO: What to do about patterns starting with ZERO? Need
-                    # to adjust the start position.
-                    state.first = token_i
-                    state.second = pattern + 1
-                    partials.push_back(state)
-                elif action in (ACCEPT, ACCEPT_PREV):
-                    start = token_i
-                    end = token_i+1 if action == ACCEPT else token_i
-                    ent_id = pattern[1].attrs[0].value
-                    label = pattern[1].attrs[1].value
-                    matches.append((ent_id, start, end))
-        # Look for open patterns that are actually satisfied
-        for state in partials:
-            while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
-                state.second += 1
-                if state.second.nr_attr == 0:
-                    start = state.first
-                    end = len(doc)
-                    ent_id = state.second.attrs[0].value
-                    label = state.second.attrs[0].value
-                    matches.append((ent_id, start, end))
-        for i, (ent_id, start, end) in enumerate(matches):
-            on_match = self._callbacks.get(ent_id)
+        matches = find_matches(&self.patterns[0], self.patterns.size(), doc)
+        for i, (key, start, end) in enumerate(matches):
+            on_match = self._callbacks.get(key, None)
            if on_match is not None:
                on_match(self, doc, i, matches)
        return matches
@ -423,31 +512,37 @@ cdef class Matcher:
            return key


+def unpickle_matcher(vocab, patterns, callbacks):
+    matcher = Matcher(vocab)
+    for key, specs in patterns.items():
+        callback = callbacks.get(key, None)
+        matcher.add(key, callback, *specs)
+    return matcher
+
+
+def _get_longest_matches(matches):
+    '''Filter out matches that have a longer equivalent.'''
+    longest_matches = {}
+    for pattern_id, start, end in matches:
+        key = (pattern_id, start)
+        length = end-start
+        if key not in longest_matches or length > longest_matches[key]:
+            longest_matches[key] = length
+    return [(pattern_id, start, start+length)
+              for (pattern_id, start), length in longest_matches.items()]
+
+
 def get_bilou(length):
-    if length == 1:
+    if length == 0:
+        raise ValueError("Length must be >= 1")
+    elif length == 1:
        return [U_ENT]
    elif length == 2:
        return [B2_ENT, L2_ENT]
    elif length == 3:
        return [B3_ENT, I3_ENT, L3_ENT]
-    elif length == 4:
-        return [B4_ENT, I4_ENT, I4_ENT, L4_ENT]
-    elif length == 5:
-        return [B5_ENT, I5_ENT, I5_ENT, I5_ENT, L5_ENT]
-    elif length == 6:
-        return [B6_ENT, I6_ENT, I6_ENT, I6_ENT, I6_ENT, L6_ENT]
-    elif length == 7:
-        return [B7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, L7_ENT]
-    elif length == 8:
-        return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT]
-    elif length == 9:
-        return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT,
-                L9_ENT]
-    elif length == 10:
-        return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT,
-                I10_ENT, I10_ENT, L10_ENT]
    else:
-        raise ValueError("Max length currently 10 for phrase matching")
+        return [B4_ENT, I4_ENT] + [I4_ENT] * (length-3) + [L4_ENT]


 cdef class PhraseMatcher:
@ -456,21 +551,21 @@ cdef class PhraseMatcher:
    cdef Matcher matcher
    cdef PreshMap phrase_ids
    cdef int max_length
-    cdef attr_t* _phrase_key
    cdef public object _callbacks
    cdef public object _patterns

    def __init__(self, Vocab vocab, max_length=10):
        self.mem = Pool()
-        self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t))
        self.max_length = max_length
        self.vocab = vocab
        self.matcher = Matcher(self.vocab)
        self.phrase_ids = PreshMap()
-        abstract_patterns = []
-        for length in range(1, max_length):
-            abstract_patterns.append([{tag: True}
-                                      for tag in get_bilou(length)])
+        abstract_patterns = [
+            [{U_ENT: True}],
+            [{B2_ENT: True}, {L2_ENT: True}],
+            [{B3_ENT: True}, {I3_ENT: True}, {L3_ENT: True}],
+            [{B4_ENT: True}, {I4_ENT: True}, {I4_ENT: True, "OP": "+"}, {L4_ENT: True}],
+        ]
        self.matcher.add('Candidate', None, *abstract_patterns)
        self._callbacks = {}

@ -504,29 +599,24 @@ cdef class PhraseMatcher:
        *docs (Doc): `Doc` objects representing match patterns.
        """
        cdef Doc doc
-        for doc in docs:
-            if len(doc) >= self.max_length:
-                msg = (
-                    "Pattern length (%d) >= phrase_matcher.max_length (%d). "
-                    "Length can be set on initialization, up to 10."
-                )
-                raise ValueError(msg % (len(doc), self.max_length))
        cdef hash_t ent_id = self.matcher._normalize_key(key)
        self._callbacks[ent_id] = on_match
        cdef int length
        cdef int i
        cdef hash_t phrase_hash
+        cdef Pool mem = Pool()
        for doc in docs:
            length = doc.length
+            if length == 0:
+                continue
            tags = get_bilou(length)
-            for i in range(self.max_length):
-                self._phrase_key[i] = 0
+            phrase_key = <attr_t*>mem.alloc(length, sizeof(attr_t))
            for i, tag in enumerate(tags):
                lexeme = self.vocab[doc.c[i].lex.orth]
                lexeme.set_flag(tag, True)
-                self._phrase_key[i] = lexeme.orth
-            phrase_hash = hash64(self._phrase_key,
-                                 self.max_length * sizeof(attr_t), 0)
+                phrase_key[i] = lexeme.orth
+            phrase_hash = hash64(phrase_key,
+                                 length * sizeof(attr_t), 0)
            self.phrase_ids.set(phrase_hash, <void*>ent_id)

    def __call__(self, Doc doc):
@ -548,28 +638,45 @@ cdef class PhraseMatcher:
                on_match(self, doc, i, matches)
        return matches

-    def pipe(self, stream, batch_size=1000, n_threads=2):
+    def pipe(self, stream, batch_size=1000, n_threads=2, return_matches=False,
+             as_tuples=False):
        """Match a stream of documents, yielding them in turn.

        docs (iterable): A stream of documents.
        batch_size (int): Number of documents to accumulate into a working set.
        n_threads (int): The number of threads with which to work on the buffer
            in parallel, if the implementation supports multi-threading.
+        return_matches (bool): Yield the match lists along with the docs, making
+            results (doc, matches) tuples.
+        as_tuples (bool): Interpret the input stream as (doc, context) tuples,
+            and yield (result, context) tuples out.
+            If both return_matches and as_tuples are True, the output will
+            be a sequence of ((doc, matches), context) tuples.
        YIELDS (Doc): Documents, in order.
        """
-        for doc in stream:
-            self(doc)
-            yield doc
+        if as_tuples:
+            for doc, context in stream:
+                matches = self(doc)
+                if return_matches:
+                    yield ((doc, matches), context)
+                else:
+                    yield (doc, context)
+        else:
+            for doc in stream:
+                matches = self(doc)
+                if return_matches:
+                    yield (doc, matches) 
+                else:
+                    yield doc

    def accept_match(self, Doc doc, int start, int end):
-        assert (end - start) < self.max_length
        cdef int i, j
-        for i in range(self.max_length):
-            self._phrase_key[i] = 0
+        cdef Pool mem = Pool()
+        phrase_key = <attr_t*>mem.alloc(end-start, sizeof(attr_t))
        for i, j in enumerate(range(start, end)):
-            self._phrase_key[i] = doc.c[j].lex.orth
-        cdef hash_t key = hash64(self._phrase_key,
-                                 self.max_length * sizeof(attr_t), 0)
+            phrase_key[i] = doc.c[j].lex.orth
+        cdef hash_t key = hash64(phrase_key,
+                                 (end-start) * sizeof(attr_t), 0)
        ent_id = <hash_t>self.phrase_ids.get(key)
        if ent_id == 0:
            return None
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@ -47,7 +47,9 @@ cdef class Morphology:
 cdef enum univ_morph_t:
    NIL = 0
    Animacy_anim = symbols.Animacy_anim
-    Animacy_inam
+    Animacy_inan
+    Animacy_hum
+    Animacy_nhum
    Aspect_freq
    Aspect_imp
    Aspect_mod
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@ -184,7 +184,9 @@ cdef class Morphology:

 IDS = {
    "Animacy_anim": Animacy_anim,
-    "Animacy_inam": Animacy_inam,
+    "Animacy_inan": Animacy_inan,
+    "Animacy_hum": Animacy_hum, # U20
+    "Animacy_nhum": Animacy_nhum,
    "Aspect_freq": Aspect_freq,
    "Aspect_imp": Aspect_imp,
    "Aspect_mod": Aspect_mod,
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@ -25,6 +25,7 @@ from .morphology cimport Morphology
 from .vocab cimport Vocab
 from .syntax import nonproj
 from .compat import json_dumps
+from .matcher import Matcher

 from .attrs import POS
 from .parts_of_speech import X
@ -97,6 +98,17 @@ def merge_entities(doc):
    return doc


+def merge_subtokens(doc, label='subtok'):
+    merger = Matcher(doc.vocab)
+    merger.add('SUBTOK', None, [{'DEP': label, 'op': '+'}])
+    matches = merger(doc)
+    spans = [doc[start:end+1] for _, start, end in matches]
+    offsets = [(span.start_char, span.end_char) for span in spans]
+    for start_char, end_char in offsets:
+        doc.merge(start_char, end_char)
+    return doc
+ 
+
 class Pipe(object):
    """This class is not instantiated directly. Components inherit from it, and
    it defines the interface that components should follow to function as
@ -167,7 +179,7 @@ class Pipe(object):
        problem.
        """
        raise NotImplementedError
-    
+
    def create_optimizer(self):
        return create_default_optimizer(self.model.ops,
                                        **self.cfg.get('optimizer', {}))
@ -652,11 +664,13 @@ class MultitaskObjective(Tagger):
            self.make_label = self.make_dep_tag_offset
        elif target == 'ent_tag':
            self.make_label = self.make_ent_tag
+        elif target == 'sent_start':
+            self.make_label = self.make_sent_start
        elif hasattr(target, '__call__'):
            self.make_label = target
        else:
            raise ValueError("MultitaskObjective target should be function or "
-                             "one of: dep, tag, ent, dep_tag_offset, ent_tag.")
+                             "one of: dep, tag, ent, sent_start, dep_tag_offset, ent_tag.")
        self.cfg = dict(cfg)
        self.cfg.setdefault('cnn_maxout_pieces', 2)
        self.cfg.setdefault('pretrained_dims',
@ -716,11 +730,7 @@ class MultitaskObjective(Tagger):
        for i, gold in enumerate(golds):
            for j in range(len(docs[i])):
                # Handes alignment for tokenization differences
-                gold_idx = gold.cand_to_gold[j]
-                if gold_idx is None:
-                    idx += 1
-                    continue
-                label = self.make_label(gold_idx, gold.words, gold.tags,
+                label = self.make_label(j, gold.words, gold.tags,
                                        gold.heads, gold.labels, gold.ents)
                if label is None or label not in self.labels:
                    correct[idx] = guesses[idx]
@ -765,6 +775,51 @@ class MultitaskObjective(Tagger):
        else:
            return '%s-%s' % (tags[i], ents[i])

+    @staticmethod
+    def make_sent_start(target, words, tags, heads, deps, ents, cache=True, _cache={}):
+        '''A multi-task objective for representing sentence boundaries,
+        using BILU scheme. (O is impossible)
+
+        The implementation of this method uses an internal cache that relies
+        on the identity of the heads array, to avoid requiring a new piece
+        of gold data. You can pass cache=False if you know the cache will
+        do the wrong thing.
+        '''
+        assert len(words) == len(heads)
+        assert target < len(words), (target, len(words))
+        if cache:
+            if id(heads) in _cache:
+                return _cache[id(heads)][target]
+            else:
+                for key in list(_cache.keys()):
+                    _cache.pop(key)
+            sent_tags = ['I-SENT'] * len(words)
+            _cache[id(heads)] = sent_tags
+        else:
+            sent_tags = ['I-SENT'] * len(words)
+
+        def _find_root(child):
+            seen = set([child])
+            while child is not None and heads[child] != child:
+                seen.add(child)
+                child = heads[child]
+            return child
+
+        sentences = {}
+        for i in range(len(words)):
+            root = _find_root(i)
+            if root is None:
+                sent_tags[i] = None
+            else:
+                sentences.setdefault(root, []).append(i)
+        for root, span in sorted(sentences.items()):
+            if len(span) == 1:
+                sent_tags[span[0]] = 'U-SENT'
+            else:
+                sent_tags[span[0]] = 'B-SENT'
+                sent_tags[span[-1]] = 'L-SENT'
+        return sent_tags[target]
+

 class SimilarityHook(Pipe):
    """
@ -823,8 +878,8 @@ class TextCategorizer(Pipe):
    name = 'textcat'

    @classmethod
-    def Model(cls, nr_class=1, width=64, **cfg):
-        return build_text_classifier(nr_class, width, **cfg)
+    def Model(cls, **cfg):
+        return build_text_classifier(**cfg)

    def __init__(self, vocab, model=True, **cfg):
        self.vocab = vocab
@ -890,6 +945,15 @@ class TextCategorizer(Pipe):
        if label in self.labels:
            return 0
        if self.model not in (None, True, False):
+            # This functionality was available previously, but was broken.
+            # The problem is that we resize the last layer, but the last layer
+            # is actually just an ensemble. We're not resizing the child layers
+            # -- a huge problem.
+            raise ValueError(
+                "Cannot currently add labels to pre-trained text classifier. "
+                "Add labels before training begins. This functionality was "
+                "available in previous versions, but had significant bugs that "
+                "let to poor performance")
            smaller = self.model._layers[-1]
            larger = Affine(len(self.labels)+1, smaller.nI)
            copy_array(larger.W[:smaller.nO], smaller.W)
@ -905,8 +969,9 @@ class TextCategorizer(Pipe):
            token_vector_width = 64
        if self.model is True:
            self.cfg['pretrained_dims'] = self.vocab.vectors_length
-            self.model = self.Model(len(self.labels), token_vector_width,
-                                    **self.cfg)
+            self.cfg['nr_class'] = len(self.labels)
+            self.cfg['width'] = token_vector_width
+            self.model = self.Model(**self.cfg)
            link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
@ -920,7 +985,7 @@ cdef class DependencyParser(Parser):
    @property
    def postprocesses(self):
        return [nonproj.deprojectivize]
-    
+
    def add_multitask_objective(self, target):
        labeller = MultitaskObjective(self.vocab, target=target)
        self._multitasks.append(labeller)
@ -941,7 +1006,7 @@ cdef class EntityRecognizer(Parser):
    TransitionSystem = BiluoPushDown

    nr_feature = 6
-    
+
    def add_multitask_objective(self, target):
        labeller = MultitaskObjective(self.vocab, target=target)
        self._multitasks.append(labeller)
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import division, print_function, unicode_literals

-from .gold import tags_to_entities
+from .gold import tags_to_entities, GoldParse


 class PRFScore(object):
@ -84,6 +84,8 @@ class Scorer(object):
        }

    def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
+        if len(tokens) != len(gold):
+            gold = GoldParse.from_annot_tuples(tokens, zip(*gold.orig_annot))
        assert len(tokens) == len(gold)
        gold_deps = set()
        gold_tags = set()
@ -100,8 +102,7 @@ class Scorer(object):
                continue
            gold_i = gold.cand_to_gold[token.i]
            if gold_i is None:
-                if token.dep_.lower() not in punct_labels:
-                    self.tokens.fp += 1
+                self.tokens.fp += 1
            else:
                self.tokens.tp += 1
                cand_tags.add((gold_i, token.tag_))
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@ -85,6 +85,7 @@ cdef enum symbol_t:
    SENT_START
    SPACY
    PROB
+    LANG

    ADJ
    ADP
@ -108,8 +109,9 @@ cdef enum symbol_t:
    SPACE

    Animacy_anim
-    Animacy_inam
+    Animacy_inan
    Animacy_hum # U20
+    Animacy_nhum
    Aspect_freq
    Aspect_imp
    Aspect_mod
@ -393,6 +395,7 @@ cdef enum symbol_t:
    EVENT
    WORK_OF_ART
    LANGUAGE
+    LAW

    DATE
    TIME
@ -451,10 +454,9 @@ cdef enum symbol_t:
    prt
    punct
    quantmod
+    relcl
    rcmod
    root
    xcomp

    acl
-    LAW
-    LANG
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@ -114,8 +114,9 @@ IDS = {
    "SPACE": SPACE,

    "Animacy_anim": Animacy_anim,
-    "Animacy_inam": Animacy_inam,
+    "Animacy_inam": Animacy_inan,
    "Animacy_hum": Animacy_hum, # U20
+    "Animacy_nhum": Animacy_nhum,
    "Aspect_freq": Aspect_freq,
    "Aspect_imp": Aspect_imp,
    "Aspect_mod": Aspect_mod,
@ -458,6 +459,7 @@ IDS = {
    "punct": punct,
    "quantmod": quantmod,
    "rcmod": rcmod,
+    "relcl": relcl,
    "root": root,
    "xcomp": xcomp,

--- a/spacy/syntax/_state.pxd
+++ b/spacy/syntax/_state.pxd
@ -108,7 +108,7 @@ cdef cppclass StateC:
            ids[1] = this.B(1)
            ids[2] = this.S(0)
            ids[3] = this.S(1)
-            ids[4] = this.H(this.S(0))
+            ids[4] = this.S(2)
            ids[5] = this.L(this.B(0), 1)
            ids[6] = this.L(this.S(0), 1)
            ids[7] = this.R(this.S(0), 1)
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@ -6,16 +6,19 @@ from __future__ import unicode_literals

 from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict, Counter
 from thinc.extra.search cimport Beam
+import json

 from .stateclass cimport StateClass
 from ._state cimport StateC
-from .nonproj import is_nonproj_tree
+from . import nonproj
 from .transition_system cimport move_cost_func_t, label_cost_func_t
 from ..gold cimport GoldParse, GoldParseC
 from ..structs cimport TokenC

+# Calculate cost as gold/not gold. We don't use scalar value anyway.
+cdef int BINARY_COSTS = 1

 DEF NON_MONOTONIC = True
 DEF USE_BREAK = True
@ -54,6 +57,8 @@ cdef weight_t push_cost(StateClass stcls, const GoldParseC* gold, int target) no
            cost += 1
        if gold.heads[S_i] == target and (NON_MONOTONIC or not stcls.has_head(S_i)):
            cost += 1
+        if BINARY_COSTS and cost >= 1:
+            return cost
    cost += Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0
    return cost

@ -67,6 +72,8 @@ cdef weight_t pop_cost(StateClass stcls, const GoldParseC* gold, int target) nog
        cost += gold.heads[target] == B_i
        if gold.heads[B_i] == B_i or gold.heads[B_i] < target:
            break
+        if BINARY_COSTS and cost >= 1:
+            return cost
    if Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0:
        cost += 1
    return cost
@ -110,7 +117,8 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
 cdef class Shift:
    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
-        return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and st.B_(0).sent_start != 1
+        sent_start = st._sent[st.B_(0).l_edge].sent_start
+        return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and sent_start != 1

    @staticmethod
    cdef int transition(StateC* st, attr_t label) nogil:
@ -170,7 +178,8 @@ cdef class Reduce:
 cdef class LeftArc:
    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
-        return st.B_(0).sent_start != 1
+        sent_start = st._sent[st.B_(0).l_edge].sent_start
+        return sent_start != 1

    @staticmethod
    cdef int transition(StateC* st, attr_t label) nogil:
@ -205,7 +214,8 @@ cdef class RightArc:
    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        # If there's (perhaps partial) parse pre-set, don't allow cycle.
-        return st.B_(0).sent_start != 1 and st.H(st.S(0)) != st.B(0)
+        sent_start = st._sent[st.B_(0).l_edge].sent_start
+        return sent_start != 1 and st.H(st.S(0)) != st.B(0)

    @staticmethod
    cdef int transition(StateC* st, attr_t label) nogil:
@ -312,39 +322,42 @@ cdef class ArcEager(TransitionSystem):

    @classmethod
    def get_actions(cls, **kwargs):
-        actions = kwargs.get('actions', OrderedDict((
-            (SHIFT, ['']),
-            (REDUCE, ['']),
-            (RIGHT, []),
-            (LEFT, []),
-            (BREAK, ['ROOT']))
-        ))
-        seen_actions = set()
+        min_freq = kwargs.get('min_freq', None)
+        actions = defaultdict(lambda: Counter())
+        actions[SHIFT][''] = 1
+        actions[REDUCE][''] = 1
        for label in kwargs.get('left_labels', []):
-            if label.upper() != 'ROOT':
-                if (LEFT, label) not in seen_actions:
-                    actions[LEFT].append(label)
-                    seen_actions.add((LEFT, label))
+            actions[LEFT][label] = 1
+            actions[SHIFT][label] = 1
        for label in kwargs.get('right_labels', []):
-            if label.upper() != 'ROOT':
-                if (RIGHT, label) not in seen_actions:
-                    actions[RIGHT].append(label)
-                    seen_actions.add((RIGHT, label))
-
+            actions[RIGHT][label] = 1
+            actions[REDUCE][label] = 1
        for raw_text, sents in kwargs.get('gold_parses', []):
            for (ids, words, tags, heads, labels, iob), ctnts in sents:
+                heads, labels = nonproj.projectivize(heads, labels)
                for child, head, label in zip(ids, heads, labels):
-                    if label.upper() == 'ROOT':
+                    if label.upper() == 'ROOT' :
                        label = 'ROOT'
-                    if label != 'ROOT':
-                        if head < child:
-                            if (RIGHT, label) not in seen_actions:
-                                actions[RIGHT].append(label)
-                                seen_actions.add((RIGHT, label))
-                        elif head > child:
-                            if (LEFT, label) not in seen_actions:
-                                actions[LEFT].append(label)
-                                seen_actions.add((LEFT, label))
+                    if head == child:
+                        actions[BREAK][label] += 1
+                    elif head < child:
+                        actions[RIGHT][label] += 1
+                        actions[REDUCE][''] += 1
+                    elif head > child:
+                        actions[LEFT][label] += 1
+                        actions[SHIFT][''] += 1
+        if min_freq is not None:
+            for action, label_freqs in actions.items():
+                for label, freq in list(label_freqs.items()):
+                    if freq < min_freq:
+                        label_freqs.pop(label)
+        # Ensure these actions are present
+        actions[BREAK].setdefault('ROOT', 0)
+        actions[RIGHT].setdefault('subtok', 0)
+        actions[LEFT].setdefault('subtok', 0)
+        # Used for backoff
+        actions[RIGHT].setdefault('dep', 0)
+        actions[LEFT].setdefault('dep', 0)
        return actions

    property action_types:
@ -376,18 +389,34 @@ cdef class ArcEager(TransitionSystem):
    def preprocess_gold(self, GoldParse gold):
        if not self.has_gold(gold):
            return None
-        for i in range(gold.length):
+        for i, (head, dep) in enumerate(zip(gold.heads, gold.labels)):
            # Missing values
-            if gold.heads[i] is None or gold.labels[i] is None:
+            if head is None or dep is None:
                gold.c.heads[i] = i
                gold.c.has_dep[i] = False
            else:
-                label = gold.labels[i]
+                if head > i:
+                    action = LEFT
+                elif head < i:
+                    action = RIGHT
+                else:
+                    action = BREAK
+                if dep not in self.labels[action]:
+                    if action == BREAK:
+                        dep = 'ROOT'
+                    elif nonproj.is_decorated(dep):
+                        backoff = nonproj.decompose(dep)[0]
+                        if backoff in self.labels[action]:
+                            dep = backoff
+                        else:
+                            dep = 'dep'
+                    else:
+                        dep = 'dep'
                gold.c.has_dep[i] = True
-                if label.upper() == 'ROOT':
-                    label = 'ROOT'
-                gold.c.heads[i] = gold.heads[i]
-                gold.c.labels[i] = self.strings.add(label)
+                if dep.upper() == 'ROOT':
+                    dep = 'ROOT'
+                gold.c.heads[i] = head
+                gold.c.labels[i] = self.strings.add(dep)
        return gold

    def get_beam_parses(self, Beam beam):
@ -527,8 +556,13 @@ cdef class ArcEager(TransitionSystem):
                is_valid[i] = False
                costs[i] = 9000
        if n_gold < 1:
-            # Check projectivity --- leading cause
-            if is_nonproj_tree(gold.heads):
+            # Check label set --- leading cause
+            label_set = set([self.strings[self.c[i].label] for i in range(self.n_moves)])
+            for label_str in gold.labels:
+                if label_str is not None and label_str not in label_set:
+                    raise ValueError("Cannot get gold parser action: unknown label: %s" % label_str)
+            # Check projectivity --- other leading cause
+            if nonproj.is_nonproj_tree(gold.heads):
                raise ValueError(
                    "Could not find a gold-standard action to supervise the "
                    "dependency parser. Likely cause: the tree is "
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@ -3,7 +3,7 @@ from __future__ import unicode_literals

 from thinc.typedefs cimport weight_t
 from thinc.extra.search cimport Beam
-from collections import OrderedDict
+from collections import OrderedDict, Counter

 from .stateclass cimport StateClass
 from ._state cimport StateC
@ -64,21 +64,18 @@ cdef class BiluoPushDown(TransitionSystem):

    @classmethod
    def get_actions(cls, **kwargs):
-        actions = kwargs.get('actions', OrderedDict((
-            (MISSING, ['']),
-            (BEGIN, []),
-            (IN, []),
-            (LAST, []),
-            (UNIT, []),
-            (OUT, [''])
-        )))
-        seen_entities = set()
+        actions = {
+            MISSING: Counter(),
+            BEGIN: Counter(),
+            IN: Counter(),
+            LAST: Counter(),
+            UNIT: Counter(),
+            OUT: Counter()
+        }
+        actions[OUT][''] = 1
        for entity_type in kwargs.get('entity_types', []):
-            if entity_type in seen_entities:
-                continue
-            seen_entities.add(entity_type)
            for action in (BEGIN, IN, LAST, UNIT):
-                actions[action].append(entity_type)
+                actions[action][entity_type] = 1
        moves = ('M', 'B', 'I', 'L', 'U')
        for raw_text, sents in kwargs.get('gold_parses', []):
            for (ids, words, tags, heads, labels, biluo), _ in sents:
@ -87,10 +84,8 @@ cdef class BiluoPushDown(TransitionSystem):
                        if ner_tag.count('-') != 1:
                            raise ValueError(ner_tag)
                        _, label = ner_tag.split('-')
-                        if label not in seen_entities:
-                            seen_entities.add(label)
-                            for move_str in ('B', 'I', 'L', 'U'):
-                                actions[moves.index(move_str)].append(label)
+                        for action in (BEGIN, IN, LAST, UNIT):
+                            actions[action][label] += 1
        return actions

    property action_types:
@ -213,7 +208,7 @@ cdef class BiluoPushDown(TransitionSystem):
            raise Exception(move)
        return t

-    def add_action(self, int action, label_name):
+    def add_action(self, int action, label_name, freq=None):
        cdef attr_t label_id
        if not isinstance(label_name, (int, long)):
            label_id = self.strings.add(label_name)
@ -234,6 +229,12 @@ cdef class BiluoPushDown(TransitionSystem):
        self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
        assert self.c[self.n_moves].label == label_id
        self.n_moves += 1
+        if self.labels.get(action, []):
+            freq = min(0, min(self.labels[action].values()))
+            self.labels[action][label_name] = freq-1
+        else:
+            self.labels[action] = Counter()
+            self.labels[action][label_name] = -1
        return 1

    cdef int initialize_state(self, StateC* st) nogil:
--- a/spacy/syntax/nn_parser.pxd
+++ b/spacy/syntax/nn_parser.pxd
@ -15,7 +15,7 @@ cdef class Parser:
    cdef readonly object cfg
    cdef public object _multitasks

-    cdef void _parseC(self, StateC* state, 
+    cdef void _parseC(self, StateC** states, int nr_task, 
            const float* feat_weights, const float* bias,
            const float* hW, const float* hb,
            int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@ -1,7 +1,6 @@
 # cython: infer_types=True
 # cython: cdivision=True
 # cython: boundscheck=False
-# cython: profile=True
 # coding: utf-8
 from __future__ import unicode_literals, print_function

@ -28,6 +27,8 @@ from thinc.misc import LayerNorm
 from thinc.neural.ops import CupyOps
 from thinc.neural.util import get_array_module
 from thinc.linalg cimport Vec, VecVec
+from thinc cimport openblas
+

 from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
 from .._ml import link_vectors_to_models, create_default_optimizer
@ -266,7 +267,7 @@ cdef class Parser:

        with Model.use_device('cpu'):
            upper = chain(
-                clone(LayerNorm(Maxout(hidden_width, hidden_width)), depth-1),
+                clone(Maxout(hidden_width, hidden_width), depth-1),
                zero_init(Affine(nr_class, hidden_width, drop_factor=0.0))
            )

@ -302,7 +303,7 @@ cdef class Parser:
        """
        self.vocab = vocab
        if moves is True:
-            self.moves = self.TransitionSystem(self.vocab.strings, {})
+            self.moves = self.TransitionSystem(self.vocab.strings)
        else:
            self.moves = moves
        if 'beam_width' not in cfg:
@ -311,12 +312,7 @@ cdef class Parser:
            cfg['beam_density'] = util.env_opt('beam_density', 0.0)
        if 'pretrained_dims' not in cfg:
            cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
-        cfg.setdefault('cnn_maxout_pieces', 3)
        self.cfg = cfg
-        if 'actions' in self.cfg:
-            for action, labels in self.cfg.get('actions', {}).items():
-                for label in labels:
-                    self.moves.add_action(action, label)
        self.model = model
        self._multitasks = []

@ -423,69 +419,81 @@ cdef class Parser:
        cdef int nr_hidden = hidden_weights.shape[0]
        cdef int nr_task = states.size()
        with nogil:
-            for i in range(nr_task):
-                self._parseC(states[i],
-                    feat_weights, bias, hW, hb,
-                    nr_class, nr_hidden, nr_feat, nr_piece)
+            self._parseC(&states[0], nr_task, feat_weights, bias, hW, hb,
+                nr_class, nr_hidden, nr_feat, nr_piece)
        PyErr_CheckSignals()
        tokvecs = self.model[0].ops.unflatten(tokvecs,
                                    [len(doc) for doc in docs])
        return state_objs, tokvecs

-    cdef void _parseC(self, StateC* state, 
+    cdef void _parseC(self, StateC** states, int nr_task, 
            const float* feat_weights, const float* bias,
            const float* hW, const float* hb,
            int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil:
        token_ids = <int*>calloc(nr_feat, sizeof(int))
        is_valid = <int*>calloc(nr_class, sizeof(int))
-        vectors = <float*>calloc(nr_hidden * nr_piece, sizeof(float))
-        scores = <float*>calloc(nr_class, sizeof(float))
+        vectors = <float*>calloc(nr_hidden * nr_task, sizeof(float))
+        unmaxed = <float*>calloc(nr_hidden * nr_piece, sizeof(float))
+        scores = <float*>calloc(nr_class*nr_task, sizeof(float))
        if not (token_ids and is_valid and vectors and scores):
            with gil:
                PyErr_SetFromErrno(MemoryError)
                PyErr_CheckSignals()
-        cdef float feature
-        while not state.is_final():
-            state.set_context_tokens(token_ids, nr_feat)
-            memset(vectors, 0, nr_hidden * nr_piece * sizeof(float))
-            memset(scores, 0, nr_class * sizeof(float))
-            sum_state_features(vectors,
-                feat_weights, token_ids, 1, nr_feat, nr_hidden * nr_piece)
-            for i in range(nr_hidden * nr_piece):
-                vectors[i] += bias[i]
-            V = vectors
-            W = hW
-            for i in range(nr_hidden):
-                if nr_piece == 1:
-                    feature = V[0] if V[0] >= 0. else 0.
-                elif nr_piece == 2:
-                    feature = V[0] if V[0] >= V[1] else V[1]
-                else:
-                    feature = Vec.max(V, nr_piece)
-                for j in range(nr_class):
-                    scores[j] += feature * W[j]
-                W += nr_class
-                V += nr_piece
-            for i in range(nr_class):
-                scores[i] += hb[i]
-            self.moves.set_valid(is_valid, state)
-            guess = arg_max_if_valid(scores, is_valid, nr_class)
-            action = self.moves.c[guess]
-            action.do(state, action.label)
-            state.push_hist(guess)
+        cdef int nr_todo = nr_task
+        cdef int i, j
+        cdef vector[StateC*] unfinished
+        while nr_todo >= 1:
+            memset(vectors, 0, nr_todo * nr_hidden * sizeof(float))
+            memset(scores, 0, nr_todo * nr_class * sizeof(float))
+            for i in range(nr_todo):
+                state = states[i]
+                state.set_context_tokens(token_ids, nr_feat)
+                memset(unmaxed, 0, nr_hidden * nr_piece * sizeof(float))
+                sum_state_features(unmaxed,
+                    feat_weights, token_ids, 1, nr_feat, nr_hidden * nr_piece)
+                VecVec.add_i(unmaxed,
+                    bias, 1., nr_hidden*nr_piece)
+                state_vector = &vectors[i*nr_hidden]
+                for j in range(nr_hidden):
+                    index = j * nr_piece
+                    which = Vec.arg_max(&unmaxed[index], nr_piece)
+                    state_vector[j] = unmaxed[index + which]
+            # Compute hidden-to-output
+            openblas.simple_gemm(scores, nr_todo, nr_class,
+                vectors, nr_todo, nr_hidden, hW, nr_hidden, nr_class, 0, 0)
+            # Add bias
+            for i in range(nr_todo):
+                VecVec.add_i(&scores[i*nr_class],
+                    hb, 1., nr_class)
+            # Validate actions, argmax, take action.
+            for i in range(nr_todo):
+                state = states[i]
+                self.moves.set_valid(is_valid, state)
+                guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
+                action = self.moves.c[guess]
+                action.do(state, action.label)
+                state.push_hist(guess)
+                if not state.is_final():
+                    unfinished.push_back(state)
+            for i in range(unfinished.size()):
+                states[i] = unfinished[i]
+            nr_todo = unfinished.size()
+            unfinished.clear()
        free(token_ids)
        free(is_valid)
        free(vectors)
+        free(unmaxed)
        free(scores)

-    def beam_parse(self, docs, int beam_width=3, float beam_density=0.001):
+    def beam_parse(self, docs, int beam_width=3, float beam_density=0.001,
+            float drop=0.):
        cdef Beam beam
        cdef np.ndarray scores
        cdef Doc doc
        cdef int nr_class = self.moves.n_moves
        cuda_stream = util.get_cuda_stream()
        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
-            docs, cuda_stream, 0.0)
+            docs, cuda_stream, drop)
        cdef int offset = 0
        cdef int j = 0
        cdef int k
@ -524,8 +532,8 @@ cdef class Parser:
                        n_states += 1
            if n_states == 0:
                break
-            vectors = state2vec(token_ids[:n_states])
-            scores = vec2scores(vectors)
+            vectors, _ = state2vec.begin_update(token_ids[:n_states], drop)
+            scores, _ = vec2scores.begin_update(vectors, drop=drop)
            c_scores = <float*>scores.data
            for beam in todo:
                for i in range(beam.size):
@ -556,7 +564,10 @@ cdef class Parser:
        for multitask in self._multitasks:
            multitask.update(docs, golds, drop=drop, sgd=sgd)
        cuda_stream = util.get_cuda_stream()
-        states, golds, max_steps = self._init_gold_batch(docs, golds)
+        # Chop sequences into lengths of this many transitions, to make the
+        # batch uniform length.
+        cut_gold = numpy.random.choice(range(20, 100))
+        states, golds, max_steps = self._init_gold_batch(docs, golds, max_length=cut_gold)
        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
                                                                            drop)
        todo = [(s, g) for (s, g) in zip(states, golds)
@ -659,8 +670,7 @@ cdef class Parser:
        for beam in beams:
            _cleanup(beam)

-
-    def _init_gold_batch(self, whole_docs, whole_golds):
+    def _init_gold_batch(self, whole_docs, whole_golds, min_length=5, max_length=500):
        """Make a square batch, of length equal to the shortest doc. A long
        doc will get multiple states. Let's say we have a doc of length 2*N,
        where N is the shortest doc. We'll make two states, one representing
@ -669,7 +679,7 @@ cdef class Parser:
            StateClass state
            Transition action
        whole_states = self.moves.init_batch(whole_docs)
-        max_length = max(5, min(50, min([len(doc) for doc in whole_docs])))
+        max_length = max(min_length, min(max_length, min([len(doc) for doc in whole_docs])))
        max_moves = 0
        states = []
        golds = []
@ -791,6 +801,11 @@ cdef class Parser:
                for doc in docs:
                    hook(doc)

+    @property
+    def labels(self):
+        class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)]
+        return class_names
+
    @property
    def tok2vec(self):
        '''Return the embedding and convolutional layer of the model.'''
@ -809,9 +824,6 @@ cdef class Parser:
        for action in self.moves.action_types:
            added = self.moves.add_action(action, label)
            if added:
-                # Important that the labels be stored as a list! We need the
-                # order, or the model goes out of synch
-                self.cfg.setdefault('extra_labels', []).append(label)
                resized = True
        if self.model not in (True, False, None) and resized:
            # Weights are stored in (nr_out, nr_in) format, so we're basically
--- a/spacy/syntax/nonproj.pyx
+++ b/spacy/syntax/nonproj.pyx
@ -9,7 +9,7 @@ from __future__ import unicode_literals

 from copy import copy

-from ..tokens.doc cimport Doc
+from ..tokens.doc cimport Doc, set_children_from_heads


 DELIMITER = '||'
@ -74,7 +74,21 @@ def decompose(label):


 def is_decorated(label):
-    return label.find(DELIMITER) != -1
+    return DELIMITER in label
+
+def count_decorated_labels(gold_tuples):
+    freqs = {}
+    for raw_text, sents in gold_tuples:
+        for (ids, words, tags, heads, labels, iob), ctnts in sents:
+            proj_heads, deco_labels = projectivize(heads, labels)
+            # set the label to ROOT for each root dependent
+            deco_labels = ['ROOT' if head == i else deco_labels[i]
+                           for i, head in enumerate(proj_heads)]
+            # count label frequencies
+            for label in deco_labels:
+                if is_decorated(label):
+                    freqs[label] = freqs.get(label, 0) + 1
+    return freqs


 def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
@ -124,8 +138,9 @@ cpdef deprojectivize(Doc doc):
        if DELIMITER in label:
            new_label, head_label = label.split(DELIMITER)
            new_head = _find_new_head(doc[i], head_label)
-            doc[i].head = new_head
+            doc.c[i].head = new_head.i - i
            doc.c[i].dep = doc.vocab.strings.add(new_label)
+    set_children_from_heads(doc.c, doc.length)
    return doc


@ -191,9 +206,12 @@ def _filter_labels(gold_tuples, cutoff, freqs):
    for raw_text, sents in gold_tuples:
        filtered_sents = []
        for (ids, words, tags, heads, labels, iob), ctnts in sents:
-            filtered_labels = [decompose(label)[0]
-                               if freqs.get(label, cutoff) < cutoff
-                               else label for label in labels]
+            filtered_labels = []
+            for label in labels:
+                if is_decorated(label) and freqs.get(label, 0) < cutoff:
+                    filtered_labels.append(decompose(label)[0])
+                else:
+                    filtered_labels.append(label)
            filtered_sents.append(
                ((ids, words, tags, heads, filtered_labels, iob), ctnts))
        filtered.append((raw_text, filtered_sents))
--- a/spacy/syntax/transition_system.pxd
+++ b/spacy/syntax/transition_system.pxd
@ -42,6 +42,7 @@ cdef class TransitionSystem:
    cdef public attr_t root_label
    cdef public freqs
    cdef init_state_t init_beam_state
+    cdef public object labels

    cdef int initialize_state(self, StateC* state) nogil
    cdef int finalize_state(self, StateC* state) nogil
--- a/spacy/syntax/transition_system.pyx
+++ b/spacy/syntax/transition_system.pyx
@ -5,7 +5,7 @@ from __future__ import unicode_literals
 from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport weight_t
-from collections import OrderedDict
+from collections import OrderedDict, Counter
 import ujson

 from ..structs cimport TokenC
@ -28,7 +28,7 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:


 cdef class TransitionSystem:
-    def __init__(self, StringStore string_table, labels_by_action):
+    def __init__(self, StringStore string_table, labels_by_action=None, min_freq=None):
        self.mem = Pool()
        self.strings = string_table
        self.n_moves = 0
@ -36,21 +36,14 @@ cdef class TransitionSystem:

        self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition))

-        for action, label_strs in labels_by_action.items():
-            for label_str in label_strs:
-                self.add_action(int(action), label_str)
+        self.labels = {}
+        if labels_by_action:
+            self.initialize_actions(labels_by_action, min_freq=min_freq)
        self.root_label = self.strings.add('ROOT')
        self.init_beam_state = _init_state

    def __reduce__(self):
-        labels_by_action = OrderedDict()
-        cdef Transition t
-        for trans in self.c[:self.n_moves]:
-            label_str = self.strings[trans.label]
-            labels_by_action.setdefault(trans.move, []).append(label_str)
-        return (self.__class__,
-                (self.strings, labels_by_action),
-                None, None)
+        return (self.__class__, (self.strings, self.labels), None, None)

    def init_batch(self, docs):
        cdef StateClass state
@ -146,6 +139,22 @@ cdef class TransitionSystem:
        act = self.c[clas]
        return self.move_name(act.move, act.label)

+    def initialize_actions(self, labels_by_action, min_freq=None):
+        self.labels = {}
+        self.n_moves = 0
+        for action, label_freqs in sorted(labels_by_action.items()):
+            action = int(action)
+            # Make sure we take a copy here, and that we get a Counter
+            self.labels[action] = Counter()
+            # Have to be careful here: Sorting must be stable, or our model
+            # won't be read back in correctly. 
+            sorted_labels = [(f, L) for L, f in label_freqs.items()]
+            sorted_labels.sort()
+            sorted_labels.reverse()
+            for freq, label_str in sorted_labels:
+                self.add_action(int(action), label_str)
+                self.labels[action][label_str] = freq 
+
    def add_action(self, int action, label_name):
        cdef attr_t label_id
        if not isinstance(label_name, int) and \
@ -164,6 +173,14 @@ cdef class TransitionSystem:
        self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
        assert self.c[self.n_moves].label == label_id
        self.n_moves += 1
+        if self.labels.get(action, []):
+            new_freq = min(self.labels[action].values())
+        else:
+            self.labels[action] = Counter()
+            new_freq = -1
+        if new_freq > 0:
+            new_freq = 0
+        self.labels[action][label_name] = new_freq-1
        return 1

    def to_disk(self, path, **exclude):
@ -178,26 +195,18 @@ cdef class TransitionSystem:

    def to_bytes(self, **exclude):
        transitions = []
-        for trans in self.c[:self.n_moves]:
-            transitions.append({
-                'clas': trans.clas,
-                'move': trans.move,
-                'label': self.strings[trans.label],
-                'name': self.move_name(trans.move, trans.label)
-            })
        serializers = {
-            'transitions': lambda: json_dumps(transitions),
+            'moves': lambda: json_dumps(self.labels),
            'strings': lambda: self.strings.to_bytes()
        }
        return util.to_bytes(serializers, exclude)

    def from_bytes(self, bytes_data, **exclude):
-        transitions = []
+        labels = {}
        deserializers = {
-            'transitions': lambda b: transitions.extend(ujson.loads(b)),
+            'moves': lambda b: labels.update(ujson.loads(b)),
            'strings': lambda b: self.strings.from_bytes(b)
        }
        msg = util.from_bytes(bytes_data, deserializers, exclude)
-        for trans in transitions:
-            self.add_action(trans['move'], trans['label'])
+        self.initialize_actions(labels)
        return self
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@ -19,6 +19,15 @@ def doc(en_tokenizer):
    return get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)


+@pytest.fixture
+def doc_not_parsed(en_tokenizer):
+    text = "This is a sentence. This is another sentence. And a third."
+    tokens = en_tokenizer(text)
+    d = get_doc(tokens.vocab, [t.text for t in tokens])
+    d.is_parsed = False
+    return d
+
+
 def test_spans_sent_spans(doc):
    sents = list(doc.sents)
    assert sents[0].start == 0
@ -34,6 +43,7 @@ def test_spans_root(doc):
    assert span.root.text == 'sentence'
    assert span.root.head.text == 'is'

+
 def test_spans_string_fn(doc):
    span = doc[0:4]
    assert len(span) == 4
@ -41,6 +51,7 @@ def test_spans_string_fn(doc):
    assert span.upper_ == 'THIS IS A SENTENCE'
    assert span.lower_ == 'this is a sentence'

+
 def test_spans_root2(en_tokenizer):
    text = "through North and South Carolina"
    heads = [0, 3, -1, -2, -4]
@ -49,12 +60,17 @@ def test_spans_root2(en_tokenizer):
    assert doc[-2:].root.text == 'Carolina'


-def test_spans_span_sent(doc):
+def test_spans_span_sent(doc, doc_not_parsed):
    """Test span.sent property"""
    assert len(list(doc.sents))
    assert doc[:2].sent.root.text == 'is'
    assert doc[:2].sent.text == 'This is a sentence .'
    assert doc[6:7].sent.root.left_edge.text == 'This'
+    # test on manual sbd
+    doc_not_parsed[0].is_sent_start = True
+    doc_not_parsed[5].is_sent_start = True
+    assert doc_not_parsed[1:3].sent == doc_not_parsed[0:5]
+    assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]


 def test_spans_lca_matrix(en_tokenizer):
@ -129,7 +145,7 @@ def test_span_to_array(doc):
    assert arr[0, 1] == len(span[0])


-def test_span_as_doc(doc):
-    span = doc[4:10]
-    span_doc = span.as_doc()
-    assert span.text == span_doc.text.strip()
+#def test_span_as_doc(doc):
+#    span = doc[4:10]
+#    span_doc = span.as_doc()
+#    assert span.text == span_doc.text.strip()
--- a/spacy/tests/gold/test_lev_align.py
+++ b/spacy/tests/gold/test_lev_align.py
@ -1,36 +0,0 @@
-# coding: utf-8
-"""Find the min-cost alignment between two tokenizations"""
-
-from __future__ import unicode_literals
-
-from ...gold import _min_edit_path as min_edit_path
-from ...gold import align
-
-import pytest
-
-
-@pytest.mark.parametrize('cand,gold,path', [
-    (["U.S", ".", "policy"], ["U.S.", "policy"], (0, 'MDM')),
-    (["U.N", ".", "policy"], ["U.S.", "policy"], (1, 'SDM')),
-    (["The", "cat", "sat", "down"], ["The", "cat", "sat", "down"], (0, 'MMMM')),
-    (["cat", "sat", "down"], ["The", "cat", "sat", "down"], (1, 'IMMM')),
-    (["The", "cat", "down"], ["The", "cat", "sat", "down"], (1, 'MMIM')),
-    (["The", "cat", "sag", "down"], ["The", "cat", "sat", "down"], (1, 'MMSM'))])
-def test_gold_lev_align_edit_path(cand, gold, path):
-    assert min_edit_path(cand, gold) == path
-
-
-def test_gold_lev_align_edit_path2():
-    cand = ["your", "stuff"]
-    gold = ["you", "r", "stuff"]
-    assert min_edit_path(cand, gold) in [(2, 'ISM'), (2, 'SIM')]
-
-
-@pytest.mark.parametrize('cand,gold,result', [
-    (["U.S", ".", "policy"], ["U.S.", "policy"], [0, None, 1]),
-    (["your", "stuff"], ["you", "r", "stuff"], [None, 2]),
-    (["i", "like", "2", "guys", "   ", "well", "id", "just", "come", "straight", "out"],
-     ["i", "like", "2", "guys", "well", "i", "d", "just", "come", "straight", "out"],
-     [0, 1, 2, 3, None, 4, None, 7, 8, 9, 10])])
-def test_gold_lev_align(cand, gold, result):
-    assert align(cand, gold) == result
--- a/spacy/tests/lang/en/test_tagger.py
+++ b/spacy/tests/lang/en/test_tagger.py
@ -2,9 +2,9 @@
 from __future__ import unicode_literals

 from ....parts_of_speech import SPACE
+from ....compat import unicode_
 from ...util import get_doc

-import six
 import pytest


@ -24,8 +24,8 @@ def test_tag_names(EN):
    text = "I ate pizzas with anchovies."
    doc = EN(text, disable=['parser'])
    assert type(doc[2].pos) == int
-    assert isinstance(doc[2].pos_, six.text_type)
-    assert isinstance(doc[2].dep_, six.text_type)
+    assert isinstance(doc[2].pos_, unicode_)
+    assert isinstance(doc[2].dep_, unicode_)
    assert doc[2].tag_ == u'NNS'


--- a/spacy/tests/parser/test_arc_eager_oracle.py
+++ b/spacy/tests/parser/test_arc_eager_oracle.py
@ -0,0 +1,75 @@
+from __future__ import unicode_literals
+from ...vocab import Vocab
+from ...pipeline import DependencyParser
+from ...tokens import Doc
+from ...gold import GoldParse
+from ...syntax.nonproj import projectivize
+
+annot_tuples = [
+    (0, 'When', 'WRB', 11, 'advmod', 'O'),
+    (1, 'Walter', 'NNP', 2, 'compound', 'B-PERSON'),
+    (2, 'Rodgers', 'NNP', 11, 'nsubj', 'L-PERSON'),
+    (3, ',', ',', 2, 'punct', 'O'),
+    (4, 'our', 'PRP$', 6, 'poss', 'O'),
+    (5, 'embedded', 'VBN', 6, 'amod', 'O'),
+    (6, 'reporter', 'NN', 2, 'appos', 'O'),
+    (7, 'with', 'IN', 6, 'prep', 'O'),
+    (8, 'the', 'DT', 10, 'det', 'B-ORG'),
+    (9, '3rd', 'NNP', 10, 'compound', 'I-ORG'),
+    (10, 'Cavalry', 'NNP', 7, 'pobj', 'L-ORG'),
+    (11, 'says', 'VBZ', 44, 'advcl', 'O'),
+    (12, 'three', 'CD', 13, 'nummod', 'U-CARDINAL'),
+    (13, 'battalions', 'NNS', 16, 'nsubj', 'O'),
+    (14, 'of', 'IN', 13, 'prep', 'O'),
+    (15, 'troops', 'NNS', 14, 'pobj', 'O'),
+    (16, 'are', 'VBP', 11, 'ccomp', 'O'),
+    (17, 'on', 'IN', 16, 'prep', 'O'),
+    (18, 'the', 'DT', 19, 'det', 'O'),
+    (19, 'ground', 'NN', 17, 'pobj', 'O'),
+    (20, ',', ',', 17, 'punct', 'O'),
+    (21, 'inside', 'IN', 17, 'prep', 'O'),
+    (22, 'Baghdad', 'NNP', 21, 'pobj', 'U-GPE'),
+    (23, 'itself', 'PRP', 22, 'appos', 'O'),
+    (24, ',', ',', 16, 'punct', 'O'),
+    (25, 'have', 'VBP', 26, 'aux', 'O'),
+    (26, 'taken', 'VBN', 16, 'dep', 'O'),
+    (27, 'up', 'RP', 26, 'prt', 'O'),
+    (28, 'positions', 'NNS', 26, 'dobj', 'O'),
+    (29, 'they', 'PRP', 31, 'nsubj', 'O'),
+    (30, "'re", 'VBP', 31, 'aux', 'O'),
+    (31, 'going', 'VBG', 26, 'parataxis', 'O'),
+    (32, 'to', 'TO', 33, 'aux', 'O'),
+    (33, 'spend', 'VB', 31, 'xcomp', 'O'),
+    (34, 'the', 'DT', 35, 'det', 'B-TIME'), 
+    (35, 'night', 'NN', 33, 'dobj', 'L-TIME'),
+    (36, 'there', 'RB', 33, 'advmod', 'O'),
+    (37, 'presumably', 'RB', 33, 'advmod', 'O'),
+    (38, ',', ',', 44, 'punct', 'O'),
+    (39, 'how', 'WRB', 40, 'advmod', 'O'),
+    (40, 'many', 'JJ', 41, 'amod', 'O'),
+    (41, 'soldiers', 'NNS', 44, 'pobj', 'O'),
+    (42, 'are', 'VBP', 44, 'aux', 'O'),
+    (43, 'we', 'PRP', 44, 'nsubj', 'O'),
+    (44, 'talking', 'VBG', 44, 'ROOT', 'O'),
+    (45, 'about', 'IN', 44, 'prep', 'O'),
+    (46, 'right', 'RB', 47, 'advmod', 'O'),
+    (47, 'now', 'RB', 44, 'advmod', 'O'),
+    (48, '?', '.', 44, 'punct', 'O')]
+
+def test_get_oracle_actions():
+    doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
+    parser = DependencyParser(doc.vocab)
+    parser.moves.add_action(0, '')
+    parser.moves.add_action(1, '')
+    parser.moves.add_action(1, '')
+    parser.moves.add_action(4, 'ROOT')
+    for i, (id_, word, tag, head, dep, ent) in enumerate(annot_tuples):
+        if head > i:
+            parser.moves.add_action(2, dep)
+        elif head < i:
+            parser.moves.add_action(3, dep)
+    ids, words, tags, heads, deps, ents = zip(*annot_tuples)
+    heads, deps = projectivize(heads, deps)
+    gold = GoldParse(doc, words=words, tags=tags, heads=heads, deps=deps)
+    parser.moves.preprocess_gold(gold)
+    actions = parser.moves.get_oracle_sequence(doc, gold)
--- a/spacy/tests/regression/test_issue1450.py
+++ b/spacy/tests/regression/test_issue1450.py
@ -13,8 +13,8 @@ from ...vocab import Vocab
        ('a b', 0, 2),
        ('a c', 0, 1),
        ('a b c', 0, 2),
-        ('a b b c', 0, 2),
-        ('a b b', 0, 2),
+        ('a b b c', 0, 3),
+        ('a b b', 0, 3),
    ]
 )
 def test_issue1450_matcher_end_zero_plus(string, start, end):
@ -54,5 +54,6 @@ def test_issue1450_matcher_end_zero_plus(string, start, end):
    if start is None or end is None:
        assert matches == []
    
-    assert matches[0][1] == start
-    assert matches[0][2] == end
+    print(matches)
+    assert matches[-1][1] == start
+    assert matches[-1][2] == end
--- a/spacy/tests/regression/test_issue1855.py
+++ b/spacy/tests/regression/test_issue1855.py
@ -0,0 +1,65 @@
+# coding: utf-8
+from __future__ import unicode_literals
+import re
+
+from ...matcher import Matcher
+
+import pytest
+
+pattern1	= [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'*'}]
+pattern2	= [{'ORTH':'A','OP':'*'},{'ORTH':'A','OP':'1'}]
+pattern3	= [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'1'}]
+pattern4	= [{'ORTH':'B','OP':'1'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
+pattern5 	= [{'ORTH':'B','OP':'*'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
+
+re_pattern1	= 'AA*'
+re_pattern2 = 'A*A'
+re_pattern3	= 'AA'
+re_pattern4	= 'BA*B'
+re_pattern5	= 'B*A*B'
+
+@pytest.fixture
+def text():
+	return "(ABBAAAAAB)."
+
+@pytest.fixture
+def doc(en_tokenizer,text):
+    doc = en_tokenizer(' '.join(text))
+    return doc
+
+@pytest.mark.xfail
+@pytest.mark.parametrize('pattern,re_pattern',[
+	(pattern1,re_pattern1),
+	(pattern2,re_pattern2),
+	(pattern3,re_pattern3),
+	(pattern4,re_pattern4),
+	(pattern5,re_pattern5)])
+def test_greedy_matching(doc,text,pattern,re_pattern):
+	"""
+	Test that the greedy matching behavior of the * op
+	is consistant with other re implementations
+	"""
+	matcher = Matcher(doc.vocab)
+	matcher.add(re_pattern,None,pattern)
+	matches = matcher(doc)
+	re_matches = [m.span() for m in re.finditer(re_pattern,text)]
+	for match,re_match in zip(matches,re_matches):
+		assert match[1:]==re_match
+
+@pytest.mark.xfail
+@pytest.mark.parametrize('pattern,re_pattern',[
+	(pattern1,re_pattern1),
+	(pattern2,re_pattern2),
+	(pattern3,re_pattern3),
+	(pattern4,re_pattern4),
+	(pattern5,re_pattern5)])
+def test_match_consuming(doc,text,pattern,re_pattern):
+	"""
+	Test that matcher.__call__ consumes tokens on a match
+	similar to re.findall
+	"""
+	matcher = Matcher(doc.vocab)
+	matcher.add(re_pattern,None,pattern)
+	matches = matcher(doc)
+	re_matches = [m.span() for m in re.finditer(re_pattern,text)]
+	assert len(matches)==len(re_matches)
--- a/spacy/tests/regression/test_issue1889.py
+++ b/spacy/tests/regression/test_issue1889.py
@ -0,0 +1,11 @@
+# coding: utf-8
+from __future__ import unicode_literals
+from ...lang.lex_attrs import is_stop
+from ...lang.en.stop_words import STOP_WORDS
+
+import pytest
+
+
+@pytest.mark.parametrize('word', ['the'])
+def test_lex_attrs_stop_words_case_sensitivity(word):
+    assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)
--- a/spacy/tests/regression/test_issue1945.py
+++ b/spacy/tests/regression/test_issue1945.py
@ -6,7 +6,6 @@ from ...vocab import Vocab
 from ...tokens import Doc
 from ...matcher import Matcher

-@pytest.mark.xfail
 def test_issue1945():
    text = "a a a"
    matcher = Matcher(Vocab())
--- a/spacy/tests/regression/test_issue850.py
+++ b/spacy/tests/regression/test_issue850.py
@ -22,10 +22,9 @@ def test_basic_case():
    assert end == 4


-@pytest.mark.xfail
 def test_issue850():
-    """The problem here is that the variable-length pattern matches the
-    succeeding token. We then don't handle the ambiguity correctly."""
+    """The variable-length pattern matches the
+    succeeding token. Check we handle the ambiguity correctly."""
    matcher = Matcher(Vocab(
                lex_attr_getters={LOWER: lambda string: string.lower()}))
    IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
--- a/spacy/tests/test_align.py
+++ b/spacy/tests/test_align.py
@ -0,0 +1,66 @@
+from __future__ import unicode_literals
+import pytest
+from .._align import align, multi_align
+
+
+@pytest.mark.parametrize('string1,string2,cost', [
+    ('hello', 'hell', 1),
+    ('rat', 'cat', 1),
+    ('rat', 'rat', 0),
+    ('rat', 'catsie', 4),
+    ('t', 'catsie', 5),
+])
+def test_align_costs(string1, string2, cost):
+    output_cost, i2j, j2i, matrix = align(string1, string2)
+    assert output_cost == cost
+
+
+@pytest.mark.parametrize('string1,string2,i2j', [
+    ('hello', 'hell', [0,1,2,3,-1]),
+    ('rat', 'cat', [0,1,2]),
+    ('rat', 'rat', [0,1,2]),
+    ('rat', 'catsie', [0,1,2]),
+    ('t', 'catsie', [2]),
+])
+def test_align_i2j(string1, string2, i2j):
+    output_cost, output_i2j, j2i, matrix = align(string1, string2)
+    assert list(output_i2j) == i2j
+
+
+@pytest.mark.parametrize('string1,string2,j2i', [
+    ('hello', 'hell', [0,1,2,3]),
+    ('rat', 'cat', [0,1,2]),
+    ('rat', 'rat', [0,1,2]),
+    ('rat', 'catsie', [0,1,2, -1, -1, -1]),
+    ('t', 'catsie', [-1, -1, 0, -1, -1, -1]),
+])
+def test_align_i2j(string1, string2, j2i):
+    output_cost, output_i2j, output_j2i, matrix = align(string1, string2)
+    assert list(output_j2i) == j2i
+
+def test_align_strings():
+    words1 = ['hello', 'this', 'is', 'test!']
+    words2 = ['hellothis', 'is', 'test', '!']
+    cost, i2j, j2i, matrix = align(words1, words2)
+    assert cost == 4
+    assert list(i2j) == [-1, -1, 1, -1]
+    assert list(j2i) == [-1, 2, -1, -1]
+
+def test_align_many_to_one():
+    words1 = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
+    words2 = ['ab', 'bc', 'e', 'fg', 'h']
+    cost, i2j, j2i, matrix = align(words1, words2)
+    assert list(i2j) == [-1, -1, -1, -1, 2, -1, -1, 4]
+    lengths1 = [len(w) for w in words1]
+    lengths2 = [len(w) for w in words2]
+    i2j_multi, j2i_multi = multi_align(i2j, j2i, lengths1, lengths2)
+    assert i2j_multi[0] == 0
+    assert i2j_multi[1] == 0
+    assert i2j_multi[2] == 1
+    assert i2j_multi[3] == 1
+    assert i2j_multi[3] == 1
+    assert i2j_multi[5] == 3
+    assert i2j_multi[6] == 3
+
+    assert j2i_multi[0] == 1
+    assert j2i_multi[1] == 3
--- a/spacy/tests/test_matcher.py
+++ b/spacy/tests/test_matcher.py
@ -3,12 +3,17 @@ from __future__ import unicode_literals

 from ..matcher import Matcher, PhraseMatcher
 from .util import get_doc
+from ..util import get_lang_class
 from ..tokens import Doc

 import pytest

+@pytest.fixture(scope="session")
+def en_vocab():
+    return get_lang_class('en').Defaults.create_vocab()

-@pytest.fixture
+
+@pytest.fixture(scope="session")
 def matcher(en_vocab):
    rules = {
        'JS':        [[{'ORTH': 'JavaScript'}]],
@ -21,187 +26,196 @@ def matcher(en_vocab):
    return matcher


-def test_matcher_from_api_docs(en_vocab):
-    matcher = Matcher(en_vocab)
-    pattern = [{'ORTH': 'test'}]
-    assert len(matcher) == 0
-    matcher.add('Rule', None, pattern)
-    assert len(matcher) == 1
-    matcher.remove('Rule')
-    assert 'Rule' not in matcher
-    matcher.add('Rule', None, pattern)
-    assert 'Rule' in matcher
-    on_match, patterns = matcher.get('Rule')
-    assert len(patterns[0])
+#def test_matcher_from_api_docs(en_vocab):
+#    matcher = Matcher(en_vocab)
+#    pattern = [{'ORTH': 'test'}]
+#    assert len(matcher) == 0
+#    matcher.add('Rule', None, pattern)
+#    assert len(matcher) == 1
+#    matcher.remove('Rule')
+#    assert 'Rule' not in matcher
+#    matcher.add('Rule', None, pattern)
+#    assert 'Rule' in matcher
+#    on_match, patterns = matcher.get('Rule')
+#    assert len(patterns[0])
+#
+#
+#def test_matcher_from_usage_docs(en_vocab):
+#    text = "Wow 😀 This is really cool! 😂 😂"
+#    doc = get_doc(en_vocab, words=text.split(' '))
+#    pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍']
+#    pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emoji]
+#
+#    def label_sentiment(matcher, doc, i, matches):
+#        match_id, start, end = matches[i]
+#        if doc.vocab.strings[match_id] == 'HAPPY':
+#            doc.sentiment += 0.1
+#        span = doc[start : end]
+#        token = span.merge()
+#        token.vocab[token.text].norm_ = 'happy emoji'
+#
+#    matcher = Matcher(en_vocab)
+#    matcher.add('HAPPY', label_sentiment, *pos_patterns)
+#    matches = matcher(doc)
+#    assert doc.sentiment != 0
+#    assert doc[1].norm_ == 'happy emoji'


-def test_matcher_from_usage_docs(en_vocab):
-    text = "Wow 😀 This is really cool! 😂 😂"
-    doc = get_doc(en_vocab, words=text.split(' '))
-    pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍']
-    pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emoji]
-
-    def label_sentiment(matcher, doc, i, matches):
-        match_id, start, end = matches[i]
-        if doc.vocab.strings[match_id] == 'HAPPY':
-            doc.sentiment += 0.1
-        span = doc[start : end]
-        token = span.merge()
-        token.vocab[token.text].norm_ = 'happy emoji'
-
-    matcher = Matcher(en_vocab)
-    matcher.add('HAPPY', label_sentiment, *pos_patterns)
-    matches = matcher(doc)
-    assert doc.sentiment != 0
-    assert doc[1].norm_ == 'happy emoji'
-
-
-@pytest.mark.parametrize('words', [["Some", "words"]])
-def test_matcher_init(en_vocab, words):
-    matcher = Matcher(en_vocab)
-    doc = get_doc(en_vocab, words)
-    assert len(matcher) == 0
-    assert matcher(doc) == []
-
-
-def test_matcher_contains(matcher):
-    matcher.add('TEST', None, [{'ORTH': 'test'}])
-    assert 'TEST' in matcher
-    assert 'TEST2' not in matcher
-
-
-def test_matcher_no_match(matcher):
-    words = ["I", "like", "cheese", "."]
-    doc = get_doc(matcher.vocab, words)
-    assert matcher(doc) == []
-
-
-def test_matcher_compile(matcher):
-    assert len(matcher) == 3
-
-
-def test_matcher_match_start(matcher):
-    words = ["JavaScript", "is", "good"]
-    doc = get_doc(matcher.vocab, words)
-    assert matcher(doc) == [(matcher.vocab.strings['JS'], 0, 1)]
-
-
-def test_matcher_match_end(matcher):
-    words = ["I", "like", "java"]
-    doc = get_doc(matcher.vocab, words)
-    assert matcher(doc) == [(doc.vocab.strings['Java'], 2, 3)]
-
-
-def test_matcher_match_middle(matcher):
-    words = ["I", "like", "Google", "Now", "best"]
-    doc = get_doc(matcher.vocab, words)
-    assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4)]
-
-
-def test_matcher_match_multi(matcher):
-    words = ["I", "like", "Google", "Now", "and", "java", "best"]
-    doc = get_doc(matcher.vocab, words)
-    assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4),
-                            (doc.vocab.strings['Java'], 5, 6)]
-
-
-def test_matcher_empty_dict(en_vocab):
-    '''Test matcher allows empty token specs, meaning match on any token.'''
-    matcher = Matcher(en_vocab)
-    abc = ["a", "b", "c"]
-    doc = get_doc(matcher.vocab, abc)
-    matcher.add('A.C', None, [{'ORTH': 'a'}, {}, {'ORTH': 'c'}])
-    matches = matcher(doc)
-    assert len(matches) == 1
-    assert matches[0][1:] == (0, 3)
-    matcher = Matcher(en_vocab)
-    matcher.add('A.', None, [{'ORTH': 'a'}, {}])
-    matches = matcher(doc)
-    assert matches[0][1:] == (0, 2)
-
-
-def test_matcher_operator_shadow(en_vocab):
-    matcher = Matcher(en_vocab)
-    abc = ["a", "b", "c"]
-    doc = get_doc(matcher.vocab, abc)
-    matcher.add('A.C', None, [{'ORTH': 'a'},
-                              {"IS_ALPHA": True, "OP": "+"},
-                              {'ORTH': 'c'}])
-    matches = matcher(doc)
-    assert len(matches) == 1
-    assert matches[0][1:] == (0, 3)
-
-
-def test_matcher_phrase_matcher(en_vocab):
-    words = ["Google", "Now"]
-    doc = get_doc(en_vocab, words)
-    matcher = PhraseMatcher(en_vocab)
-    matcher.add('COMPANY', None, doc)
-    words = ["I", "like", "Google", "Now", "best"]
-    doc = get_doc(en_vocab, words)
-    assert len(matcher(doc)) == 1
-
-
-def test_phrase_matcher_length(en_vocab):
-    matcher = PhraseMatcher(en_vocab)
-    assert len(matcher) == 0
-    matcher.add('TEST', None, get_doc(en_vocab, ['test']))
-    assert len(matcher) == 1
-    matcher.add('TEST2', None, get_doc(en_vocab, ['test2']))
-    assert len(matcher) == 2
-
-
-def test_phrase_matcher_contains(en_vocab):
-    matcher = PhraseMatcher(en_vocab)
-    matcher.add('TEST', None, get_doc(en_vocab, ['test']))
-    assert 'TEST' in matcher
-    assert 'TEST2' not in matcher
-
-
-def test_matcher_match_zero(matcher):
-    words1 = 'He said , " some words " ...'.split()
-    words2 = 'He said , " some three words " ...'.split()
-    pattern1 = [{'ORTH': '"'},
-                {'OP': '!', 'IS_PUNCT': True},
-                {'OP': '!', 'IS_PUNCT': True},
-                {'ORTH': '"'}]
-    pattern2 = [{'ORTH': '"'},
-                {'IS_PUNCT': True},
-                {'IS_PUNCT': True},
-                {'IS_PUNCT': True},
-                {'ORTH': '"'}]
-
-    matcher.add('Quote', None, pattern1)
-    doc = get_doc(matcher.vocab, words1)
-    assert len(matcher(doc)) == 1
-
-    doc = get_doc(matcher.vocab, words2)
-    assert len(matcher(doc)) == 0
-    matcher.add('Quote', None, pattern2)
-    assert len(matcher(doc)) == 0
-
-
-def test_matcher_match_zero_plus(matcher):
-    words = 'He said , " some words " ...'.split()
-    pattern = [{'ORTH': '"'},
-               {'OP': '*', 'IS_PUNCT': False},
-               {'ORTH': '"'}]
-    matcher.add('Quote', None, pattern)
-    doc = get_doc(matcher.vocab, words)
-    assert len(matcher(doc)) == 1
-
-
-def test_matcher_match_one_plus(matcher):
-    control = Matcher(matcher.vocab)
-    control.add('BasicPhilippe', None, [{'ORTH': 'Philippe'}])
-    doc = get_doc(control.vocab, ['Philippe', 'Philippe'])
-    m = control(doc)
-    assert len(m) == 2
-    matcher.add('KleenePhilippe', None, [{'ORTH': 'Philippe', 'OP': '1'},
-                                         {'ORTH': 'Philippe', 'OP': '+'}])
-    m = matcher(doc)
-    assert len(m) == 1
-
+#@pytest.mark.parametrize('words', [["Some", "words"]])
+#def test_matcher_init(en_vocab, words):
+#    matcher = Matcher(en_vocab)
+#    doc = get_doc(en_vocab, words)
+#    assert len(matcher) == 0
+#    assert matcher(doc) == []
+#
+#
+#def test_matcher_contains(matcher):
+#    matcher.add('TEST', None, [{'ORTH': 'test'}])
+#    assert 'TEST' in matcher
+#    assert 'TEST2' not in matcher
+#
+#
+#def test_matcher_no_match(matcher):
+#    words = ["I", "like", "cheese", "."]
+#    doc = get_doc(matcher.vocab, words)
+#    assert matcher(doc) == []
+#
+#
+#def test_matcher_compile(en_vocab):
+#    rules = {
+#        'JS':        [[{'ORTH': 'JavaScript'}]],
+#        'GoogleNow': [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]],
+#        'Java':      [[{'LOWER': 'java'}]]
+#    }
+#    matcher = Matcher(en_vocab)
+#    for key, patterns in rules.items():
+#        matcher.add(key, None, *patterns)
+#    assert len(matcher) == 3
+#
+#
+#def test_matcher_match_start(matcher):
+#    words = ["JavaScript", "is", "good"]
+#    doc = get_doc(matcher.vocab, words)
+#    assert matcher(doc) == [(matcher.vocab.strings['JS'], 0, 1)]
+#
+#
+#def test_matcher_match_end(matcher):
+#    words = ["I", "like", "java"]
+#    doc = get_doc(matcher.vocab, words)
+#    assert matcher(doc) == [(doc.vocab.strings['Java'], 2, 3)]
+#
+#
+#def test_matcher_match_middle(matcher):
+#    words = ["I", "like", "Google", "Now", "best"]
+#    doc = get_doc(matcher.vocab, words)
+#    assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4)]
+#
+#
+#def test_matcher_match_multi(matcher):
+#    words = ["I", "like", "Google", "Now", "and", "java", "best"]
+#    doc = get_doc(matcher.vocab, words)
+#    assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4),
+#                            (doc.vocab.strings['Java'], 5, 6)]
+#
+#
+#def test_matcher_empty_dict(en_vocab):
+#    '''Test matcher allows empty token specs, meaning match on any token.'''
+#    matcher = Matcher(en_vocab)
+#    abc = ["a", "b", "c"]
+#    doc = get_doc(matcher.vocab, abc)
+#    matcher.add('A.C', None, [{'ORTH': 'a'}, {}, {'ORTH': 'c'}])
+#    matches = matcher(doc)
+#    assert len(matches) == 1
+#    assert matches[0][1:] == (0, 3)
+#    matcher = Matcher(en_vocab)
+#    matcher.add('A.', None, [{'ORTH': 'a'}, {}])
+#    matches = matcher(doc)
+#    assert matches[0][1:] == (0, 2)
+#
+#
+#def test_matcher_operator_shadow(en_vocab):
+#    matcher = Matcher(en_vocab)
+#    abc = ["a", "b", "c"]
+#    doc = get_doc(matcher.vocab, abc)
+#    matcher.add('A.C', None, [{'ORTH': 'a'},
+#                              {"IS_ALPHA": True, "OP": "+"},
+#                              {'ORTH': 'c'}])
+#    matches = matcher(doc)
+#    assert len(matches) == 1
+#    assert matches[0][1:] == (0, 3)
+#
+#
+#def test_matcher_phrase_matcher(en_vocab):
+#    words = ["Google", "Now"]
+#    doc = get_doc(en_vocab, words)
+#    matcher = PhraseMatcher(en_vocab)
+#    matcher.add('COMPANY', None, doc)
+#    words = ["I", "like", "Google", "Now", "best"]
+#    doc = get_doc(en_vocab, words)
+#    assert len(matcher(doc)) == 1
+#
+#
+#def test_phrase_matcher_length(en_vocab):
+#    matcher = PhraseMatcher(en_vocab)
+#    assert len(matcher) == 0
+#    matcher.add('TEST', None, get_doc(en_vocab, ['test']))
+#    assert len(matcher) == 1
+#    matcher.add('TEST2', None, get_doc(en_vocab, ['test2']))
+#    assert len(matcher) == 2
+#
+#
+#def test_phrase_matcher_contains(en_vocab):
+#    matcher = PhraseMatcher(en_vocab)
+#    matcher.add('TEST', None, get_doc(en_vocab, ['test']))
+#    assert 'TEST' in matcher
+#    assert 'TEST2' not in matcher
+#
+#
+#def test_matcher_match_zero(matcher):
+#    words1 = 'He said , " some words " ...'.split()
+#    words2 = 'He said , " some three words " ...'.split()
+#    pattern1 = [{'ORTH': '"'},
+#                {'OP': '!', 'IS_PUNCT': True},
+#                {'OP': '!', 'IS_PUNCT': True},
+#                {'ORTH': '"'}]
+#    pattern2 = [{'ORTH': '"'},
+#                {'IS_PUNCT': True},
+#                {'IS_PUNCT': True},
+#                {'IS_PUNCT': True},
+#                {'ORTH': '"'}]
+#
+#    matcher.add('Quote', None, pattern1)
+#    doc = get_doc(matcher.vocab, words1)
+#    assert len(matcher(doc)) == 1
+#
+#    doc = get_doc(matcher.vocab, words2)
+#    assert len(matcher(doc)) == 0
+#    matcher.add('Quote', None, pattern2)
+#    assert len(matcher(doc)) == 0
+#
+#
+#def test_matcher_match_zero_plus(matcher):
+#    words = 'He said , " some words " ...'.split()
+#    pattern = [{'ORTH': '"'},
+#               {'OP': '*', 'IS_PUNCT': False},
+#               {'ORTH': '"'}]
+#    matcher = Matcher(matcher.vocab)
+#    matcher.add('Quote', None, pattern)
+#    doc = get_doc(matcher.vocab, words)
+#    assert len(matcher(doc)) == 1
+#
+#
+#def test_matcher_match_one_plus(matcher):
+#    control = Matcher(matcher.vocab)
+#    control.add('BasicPhilippe', None, [{'ORTH': 'Philippe'}])
+#    doc = get_doc(control.vocab, ['Philippe', 'Philippe'])
+#    m = control(doc)
+#    assert len(m) == 2
+#    matcher.add('KleenePhilippe', None, [{'ORTH': 'Philippe', 'OP': '1'},
+#                                         {'ORTH': 'Philippe', 'OP': '+'}])
+#    m = matcher(doc)
+#    assert len(m) == 1
+#

 def test_operator_combos(matcher):
    cases = [
@ -252,9 +266,8 @@ def test_matcher_end_zero_plus(matcher):
    )
    nlp = lambda string: Doc(matcher.vocab, words=string.split())
    assert len(matcher(nlp(u'a'))) == 1
-    assert len(matcher(nlp(u'a b'))) == 1
-    assert len(matcher(nlp(u'a b'))) == 1
+    assert len(matcher(nlp(u'a b'))) == 2
    assert len(matcher(nlp(u'a c'))) == 1
-    assert len(matcher(nlp(u'a b c'))) == 1
-    assert len(matcher(nlp(u'a b b c'))) == 1
-    assert len(matcher(nlp(u'a b b'))) == 1
+    assert len(matcher(nlp(u'a b c'))) == 2
+    assert len(matcher(nlp(u'a b b c'))) == 3
+    assert len(matcher(nlp(u'a b b'))) == 3
--- a/spacy/tests/test_textcat.py
+++ b/spacy/tests/test_textcat.py
@ -0,0 +1,44 @@
+from __future__ import unicode_literals
+import random
+import numpy.random
+
+from ..pipeline import TextCategorizer
+from ..lang.en import English
+from ..vocab import Vocab
+from ..tokens import Doc
+from ..gold import GoldParse
+
+
+def test_textcat_learns_multilabel():
+    random.seed(0)
+    numpy.random.seed(0)
+    docs = []
+    nlp = English()
+    vocab = nlp.vocab
+    letters = ['a', 'b', 'c']
+    for w1 in letters:
+        for w2 in letters:
+            cats = {letter: float(w2==letter) for letter in letters}
+            docs.append((Doc(vocab, words=['d']*3 + [w1, w2] + ['d']*3), cats))
+    random.shuffle(docs)
+    model = TextCategorizer(vocab, width=8)
+    for letter in letters:
+        model.add_label(letter)
+    optimizer = model.begin_training()
+    for i in range(30):
+        losses = {}
+        Ys = [GoldParse(doc, cats=cats) for doc, cats in docs]
+        Xs = [doc for doc, cats in docs]
+        model.update(Xs, Ys, sgd=optimizer, losses=losses)
+        random.shuffle(docs)
+    for w1 in letters:
+        for w2 in letters:
+            doc = Doc(vocab, words=['d']*3 + [w1, w2] + ['d']*3)
+            truth = {letter: w2==letter for letter in letters}
+            model(doc)
+            for cat, score in doc.cats.items():
+                if not truth[cat]:
+                    assert score < 0.5
+                else:
+                    assert score > 0.5
+
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@ -19,6 +19,9 @@ ctypedef fused LexemeOrToken:
    const_TokenC_ptr


+cdef int set_children_from_heads(TokenC* tokens, int length) except -1
+
+
 cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2


--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -186,6 +186,20 @@ cdef class Doc:
    def _(self):
        return Underscore(Underscore.doc_extensions, self)

+    @property
+    def is_sentenced(self):
+        # Check if the document has sentence boundaries,
+        # i.e at least one tok has the sent_start in (-1, 1)
+        if 'sents' in self.user_hooks:
+            return True
+        if self.is_parsed:
+            return True
+        for i in range(self.length):
+            if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
+                return True
+        else:
+            return False
+
    def __getitem__(self, object i):
        """Get a `Token` or `Span` object.

@ -517,29 +531,23 @@ cdef class Doc:
            >>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
        """
        def __get__(self):
+            if not self.is_sentenced:
+                raise ValueError(
+                    "Sentence boundaries unset. You can add the 'sentencizer' "
+                    "component to the pipeline with: "
+                    "nlp.add_pipe(nlp.create_pipe('sentencizer')) "
+                    "Alternatively, add the dependency parser, or set "
+                    "sentence boundaries by setting doc[i].sent_start")
            if 'sents' in self.user_hooks:
                yield from self.user_hooks['sents'](self)
-                return
-
-            cdef int i
-            if not self.is_parsed:
+            else:
+                start = 0
                for i in range(1, self.length):
-                    if self.c[i].sent_start != 0:
-                        break
-                else:
-                    raise ValueError(
-                        "Sentence boundaries unset. You can add the 'sentencizer' "
-                        "component to the pipeline with: "
-                        "nlp.add_pipe(nlp.create_pipe('sentencizer')) "
-                        "Alternatively, add the dependency parser, or set "
-                        "sentence boundaries by setting doc[i].sent_start")
-            start = 0
-            for i in range(1, self.length):
-                if self.c[i].sent_start == 1:
-                    yield Span(self, start, i)
-                    start = i
-            if start != self.length:
-                yield Span(self, start, self.length)
+                    if self.c[i].sent_start == 1:
+                        yield Span(self, start, i)
+                        start = i
+                if start != self.length:
+                    yield Span(self, start, self.length)

    cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
        if self.length == 0:
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -285,16 +285,42 @@ cdef class Span:
        def __get__(self):
            if 'sent' in self.doc.user_span_hooks:
                return self.doc.user_span_hooks['sent'](self)
-            # This should raise if we're not parsed.
+            # This should raise if we're not parsed
+            # or doesen't have any sbd component :)
            self.doc.sents
+            # if doc is parsed we can use the deps to find the sentence
+            # otherwise we use the `sent_start` token attribute
            cdef int n = 0
-            root = &self.doc.c[self.start]
-            while root.head != 0:
-                root += root.head
-                n += 1
-                if n >= self.doc.length:
-                    raise RuntimeError
-            return self.doc[root.l_edge:root.r_edge + 1]
+            cdef int i
+            if self.doc.is_parsed:
+                root = &self.doc.c[self.start]
+                n = 0
+                while root.head != 0:
+                    root += root.head
+                    n += 1
+                    if n >= self.doc.length:
+                        raise RuntimeError
+                return self.doc[root.l_edge:root.r_edge + 1]
+            elif self.doc.is_sentenced:
+                # find start of the sentence
+                start = self.start
+                while self.doc.c[start].sent_start != 1 and start > 0:
+                    start += -1
+                # find end of the sentence
+                end = self.end
+                n = 0
+                while end < self.doc.length and self.doc.c[end].sent_start != 1:
+                    end += 1
+                    n += 1
+                    if n >= self.doc.length:
+                        break
+                #
+                return self.doc[start:end]
+            else:
+                raise ValueError(
+                    "Access to sentence requires either the dependency parse "
+                    "or sentence boundaries to be set by setting " +
+                    "doc[i].is_sent_start = True")

    property has_vector:
        """RETURNS (bool): Whether a word vector is associated with the object.
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -34,11 +34,11 @@ cdef class Token:

    @classmethod
    def get_extension(cls, name):
-        return Underscore.token_extensions.get(name)
+        return Underscore.span_extensions.get(name)

    @classmethod
    def has_extension(cls, name):
-        return name in Underscore.token_extensions
+        return name in Underscore.span_extensions

    def __cinit__(self, Vocab vocab, Doc doc, int offset):
        """Construct a `Token` object.
--- a/spacy/util.py
+++ b/spacy/util.py
@ -442,6 +442,29 @@ def decaying(start, stop, decay):
        nr_upd += 1


+def minibatch_by_words(items, size, count_words=len):
+    '''Create minibatches of a given number of words.'''
+    if isinstance(size, int):
+        size_ = itertools.repeat(size)
+    else:
+        size_ = size
+    items = iter(items)
+    while True:
+        batch_size = next(size_)
+        batch = []
+        while batch_size >= 0:
+            try:
+                doc, gold = next(items)
+            except StopIteration:
+                if batch:
+                    yield batch
+                return
+            batch_size -= count_words(doc)
+            batch.append((doc, gold))
+        if batch:
+            yield batch
+
+
 def itershuffle(iterable, bufsize=1000):
    """Shuffle an iterator. This works by holding `bufsize` items back
    and yielding them sometime later. Obviously, this is not unbiased –
@ -457,7 +480,7 @@ def itershuffle(iterable, bufsize=1000):
    try:
        while True:
            for i in range(random.randint(1, bufsize-len(buf))):
-                buf.append(iterable.next())
+                buf.append(next(iterable))
            random.shuffle(buf)
            for i in range(random.randint(1, bufsize)):
                if buf:
--- a/website/usage/resources.jade
+++ b/website/usage/resources.jade
@ -120,9 +120,6 @@ include ../_includes/_mixins
            |  A Practical Real-World Approach to Gaining Actionable Insights
            |  from your Data

-        +card("Practical Machine Learning with Python", "", "Dipanjan Sarkar et al. (Apress, 2017)", "book")
-            |  A Problem-Solver's Guide to Building Real-World Intelligent Systems
-
 +section("notebooks")
    +h(2, "notebooks") Jupyter notebooks