Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop"

This reverts commit c9ba3d3c2d, reversing
changes made to 92c26a35d4.
This commit is contained in:
Matthew Honnibal 2018-03-27 19:23:02 +02:00
parent 8b7a74570f
commit 1f7229f40f
67 changed files with 4799 additions and 1040 deletions


@@ -32,7 +32,7 @@ test_script:
   # Note that you must use the environment variable %PYTHON% to refer to
   # the interpreter you're using - Appveyor does not do anything special
   # to put the Python version you want to use on PATH.
-  - "%PYTHON%\\python.exe -m pytest spacy/"
+  - "%PYTHON%\\python.exe -m pytest spacy/ --no-print-logs"

 after_test:
   # This step builds your wheels.

11
.buildkite/train.yml Normal file

@@ -0,0 +1,11 @@
steps:
  -
    command: "fab env clean make test wheel"
    label: ":dizzy: :python:"
    artifact_paths: "dist/*.whl"
  - wait
  - trigger: "spacy-train-from-wheel"
    label: ":dizzy: :train:"
    build:
      env:
        SPACY_VERSION: "{$SPACY_VERSION}"


@@ -182,7 +182,7 @@ If you've made a contribution to spaCy, you should fill in the
 [spaCy contributor agreement](.github/CONTRIBUTOR_AGREEMENT.md) to ensure that
 your contribution can be used across the project. If you agree to be bound by
 the terms of the agreement, fill in the [template](.github/CONTRIBUTOR_AGREEMENT.md)
-and include it with your pull request, or sumit it separately to
+and include it with your pull request, or submit it separately to
 [`.github/contributors/`](/.github/contributors). The name of the file should be
 your GitHub username, with the extension `.md`. For example, the user
 example_user would create the file `.github/contributors/example_user.md`.

392
examples/training/conllu.py Normal file

@@ -0,0 +1,392 @@
'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
.conllu format for development data, allowing the official scorer to be used.
'''
from __future__ import unicode_literals
import plac
import tqdm
import attr
from pathlib import Path
import re
import sys
import json
import spacy
import spacy.util
from spacy.tokens import Token, Doc
from spacy.gold import GoldParse
from spacy.syntax.nonproj import projectivize
from collections import defaultdict, Counter
from timeit import default_timer as timer
from spacy.matcher import Matcher
import itertools
import random
import numpy.random
import cytoolz
import conll17_ud_eval
import spacy.lang.zh
import spacy.lang.ja
spacy.lang.zh.Chinese.Defaults.use_jieba = False
spacy.lang.ja.Japanese.Defaults.use_janome = False
random.seed(0)
numpy.random.seed(0)
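# Batch up (doc, gold) pairs so that each batch holds roughly `size` words in
# total: the items are shuffled, then docs are accumulated until the current
# batch's word budget is used up. `size` may be an int or an iterable yielding
# a per-batch budget.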
def minibatch_by_words(items, size=5000):
random.shuffle(items)
if isinstance(size, int):
size_ = itertools.repeat(size)
else:
size_ = size
items = iter(items)
while True:
batch_size = next(size_)
batch = []
while batch_size >= 0:
try:
doc, gold = next(items)
except StopIteration:
if batch:
yield batch
return
batch_size -= len(doc)
batch.append((doc, gold))
if batch:
yield batch
else:
break
################
# Data reading #
################
space_re = re.compile(r'\s+')
def split_text(text):
return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')]
def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
max_doc_length=None, limit=None):
'''Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
include Doc objects created using nlp.make_doc and then aligned against
the gold-standard sequences. If oracle_segments=True, include Doc objects
created from the gold-standard segments. At least one must be True.'''
if not raw_text and not oracle_segments:
raise ValueError("At least one of raw_text or oracle_segments must be True")
paragraphs = split_text(text_file.read())
conllu = read_conllu(conllu_file)
# sd is spacy doc; cd is conllu doc
# cs is conllu sent, ct is conllu token
docs = []
golds = []
for doc_id, (text, cd) in enumerate(zip(paragraphs, conllu)):
sent_annots = []
for cs in cd:
sent = defaultdict(list)
for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
if '.' in id_:
continue
if '-' in id_:
continue
id_ = int(id_)-1
head = int(head)-1 if head != '0' else id_
sent['words'].append(word)
sent['tags'].append(tag)
sent['heads'].append(head)
sent['deps'].append('ROOT' if dep == 'root' else dep)
sent['spaces'].append(space_after == '_')
sent['entities'] = ['-'] * len(sent['words'])
sent['heads'], sent['deps'] = projectivize(sent['heads'],
sent['deps'])
if oracle_segments:
docs.append(Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces']))
golds.append(GoldParse(docs[-1], **sent))
sent_annots.append(sent)
if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
doc, gold = _make_gold(nlp, None, sent_annots)
sent_annots = []
docs.append(doc)
golds.append(gold)
if limit and len(docs) >= limit:
return docs, golds
if raw_text and sent_annots:
doc, gold = _make_gold(nlp, None, sent_annots)
docs.append(doc)
golds.append(gold)
if limit and len(docs) >= limit:
return docs, golds
return docs, golds
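# Usage sketch (file names here are illustrative, not taken from this repo):
#     with open('en_ewt-ud-train.conllu') as conllu_file, \
#             open('en_ewt-ud-train.txt') as text_file:
#         docs, golds = read_data(nlp, conllu_file, text_file,
#                                 max_doc_length=10, limit=100)
# Each returned Doc is paired with a GoldParse carrying the projectivized
# heads/deps, tags, spaces and placeholder entity annotation.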
def read_conllu(file_):
docs = []
sent = []
doc = []
for line in file_:
if line.startswith('# newdoc'):
if doc:
docs.append(doc)
doc = []
elif line.startswith('#'):
continue
elif not line.strip():
if sent:
doc.append(sent)
sent = []
else:
sent.append(list(line.strip().split('\t')))
if len(sent[-1]) != 10:
print(repr(line))
raise ValueError
if sent:
doc.append(sent)
if doc:
docs.append(doc)
return docs
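# _make_gold flattens per-sentence annotations into one document-level
# GoldParse. Head indices are offset by the number of words already collected,
# e.g. if the first sentence has 3 words, a head index of 1 in the second
# sentence becomes 4.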
def _make_gold(nlp, text, sent_annots):
# Flatten the conll annotations, and adjust the head indices
flat = defaultdict(list)
for sent in sent_annots:
flat['heads'].extend(len(flat['words'])+head for head in sent['heads'])
for field in ['words', 'tags', 'deps', 'entities', 'spaces']:
flat[field].extend(sent[field])
# Construct text if necessary
assert len(flat['words']) == len(flat['spaces'])
if text is None:
text = ''.join(word+' '*space for word, space in zip(flat['words'], flat['spaces']))
doc = nlp.make_doc(text)
flat.pop('spaces')
gold = GoldParse(doc, **flat)
return doc, gold
#############################
# Data transforms for spaCy #
#############################
def golds_to_gold_tuples(docs, golds):
'''Get out the annoying 'tuples' format used by begin_training, given the
GoldParse objects.'''
tuples = []
for doc, gold in zip(docs, golds):
text = doc.text
ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
sents = [((ids, words, tags, heads, labels, iob), [])]
tuples.append((text, sents))
return tuples
##############
# Evaluation #
##############
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
with text_loc.open('r', encoding='utf8') as text_file:
texts = split_text(text_file.read())
docs = list(nlp.pipe(texts))
with sys_loc.open('w', encoding='utf8') as out_file:
write_conllu(docs, out_file)
with gold_loc.open('r', encoding='utf8') as gold_file:
gold_ud = conll17_ud_eval.load_conllu(gold_file)
with sys_loc.open('r', encoding='utf8') as sys_file:
sys_ud = conll17_ud_eval.load_conllu(sys_file)
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
return scores
def write_conllu(docs, file_):
merger = Matcher(docs[0].vocab)
merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}])
for i, doc in enumerate(docs):
matches = merger(doc)
spans = [doc[start:end+1] for _, start, end in matches]
offsets = [(span.start_char, span.end_char) for span in spans]
for start_char, end_char in offsets:
doc.merge(start_char, end_char)
file_.write("# newdoc id = {i}\n".format(i=i))
for j, sent in enumerate(doc.sents):
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
file_.write("# text = {text}\n".format(text=sent.text))
for k, token in enumerate(sent):
file_.write(token._.get_conllu_lines(k) + '\n')
file_.write('\n')
def print_progress(itn, losses, ud_scores):
fields = {
'dep_loss': losses.get('parser', 0.0),
'tag_loss': losses.get('tagger', 0.0),
'words': ud_scores['Words'].f1 * 100,
'sents': ud_scores['Sentences'].f1 * 100,
'tags': ud_scores['XPOS'].f1 * 100,
'uas': ud_scores['UAS'].f1 * 100,
'las': ud_scores['LAS'].f1 * 100,
}
header = ['Epoch', 'Loss', 'LAS', 'UAS', 'TAG', 'SENT', 'WORD']
if itn == 0:
print('\t'.join(header))
tpl = '\t'.join((
'{:d}',
'{dep_loss:.1f}',
'{las:.1f}',
'{uas:.1f}',
'{tags:.1f}',
'{sents:.1f}',
'{words:.1f}',
))
print(tpl.format(itn, **fields))
#def get_sent_conllu(sent, sent_id):
# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
def get_token_conllu(token, i):
if token._.begins_fused:
n = 1
while token.nbor(n)._.inside_fused:
n += 1
id_ = '%d-%d' % (i, i+n)
lines = [id_, token.text, '_', '_', '_', '_', '_', '_', '_', '_']
else:
lines = []
if token.head.i == token.i:
head = 0
else:
head = i + (token.head.i - token.i) + 1
fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, '_',
str(head), token.dep_.lower(), '_', '_']
lines.append('\t'.join(fields))
return '\n'.join(lines)
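# For an ordinary (non-fused) token at position i=0 whose head is the next
# token, the line built above comes out roughly as (illustrative values only):
#     1\tThe\tthe\tDET\tDT\t_\t2\tdet\t_\t_
# Fused tokens are preceded by an extra "i-j" range line with underscore fields.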
Token.set_extension('get_conllu_lines', method=get_token_conllu)
Token.set_extension('begins_fused', default=False)
Token.set_extension('inside_fused', default=False)
##################
# Initialization #
##################
def load_nlp(corpus, config):
lang = corpus.split('_')[0]
nlp = spacy.blank(lang)
if config.vectors:
nlp.vocab.from_disk(config.vectors / 'vocab')
return nlp
def initialize_pipeline(nlp, docs, golds, config):
nlp.add_pipe(nlp.create_pipe('parser'))
if config.multitask_tag:
nlp.parser.add_multitask_objective('tag')
if config.multitask_sent:
nlp.parser.add_multitask_objective('sent_start')
nlp.parser.moves.add_action(2, 'subtok')
nlp.add_pipe(nlp.create_pipe('tagger'))
for gold in golds:
for tag in gold.tags:
if tag is not None:
nlp.tagger.add_label(tag)
# Replace labels that didn't make the frequency cutoff
actions = set(nlp.parser.labels)
label_set = set([act.split('-')[1] for act in actions if '-' in act])
for gold in golds:
for i, label in enumerate(gold.labels):
if label is not None and label not in label_set:
gold.labels[i] = label.split('||')[0]
return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))
########################
# Command line helpers #
########################
@attr.s
class Config(object):
vectors = attr.ib(default=None)
max_doc_length = attr.ib(default=10)
multitask_tag = attr.ib(default=True)
multitask_sent = attr.ib(default=True)
nr_epoch = attr.ib(default=30)
batch_size = attr.ib(default=1000)
dropout = attr.ib(default=0.2)
@classmethod
def load(cls, loc):
with Path(loc).open('r', encoding='utf8') as file_:
cfg = json.load(file_)
return cls(**cfg)
class Dataset(object):
def __init__(self, path, section):
self.path = path
self.section = section
self.conllu = None
self.text = None
for file_path in self.path.iterdir():
name = file_path.parts[-1]
if section in name and name.endswith('conllu'):
self.conllu = file_path
elif section in name and name.endswith('txt'):
self.text = file_path
if self.conllu is None:
msg = "Could not find .conllu file in {path} for {section}"
raise IOError(msg.format(section=section, path=path))
if self.text is None:
msg = "Could not find .txt file in {path} for {section}"
raise IOError(msg.format(section=section, path=path))
self.lang = self.conllu.parts[-1].split('-')[0].split('_')[0]
class TreebankPaths(object):
def __init__(self, ud_path, treebank, **cfg):
self.train = Dataset(ud_path / treebank, 'train')
self.dev = Dataset(ud_path / treebank, 'dev')
self.lang = self.train.lang
@plac.annotations(
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
corpus=("UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
"positional", None, str),
parses_dir=("Directory to write the development parses", "positional", None, Path),
config=("Path to json formatted config file", "positional", None, Config.load),
limit=("Size limit", "option", "n", int)
)
def main(ud_dir, parses_dir, config, corpus, limit=0):
paths = TreebankPaths(ud_dir, corpus)
if not (parses_dir / corpus).exists():
(parses_dir / corpus).mkdir()
print("Train and evaluate", corpus, "using lang", paths.lang)
nlp = load_nlp(paths.lang, config)
docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
max_doc_length=config.max_doc_length, limit=limit)
optimizer = initialize_pipeline(nlp, docs, golds, config)
for i in range(config.nr_epoch):
docs = [nlp.make_doc(doc.text) for doc in docs]
batches = minibatch_by_words(list(zip(docs, golds)), size=config.batch_size)
losses = {}
n_train_words = sum(len(doc) for doc in docs)
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
for batch in batches:
batch_docs, batch_gold = zip(*batch)
pbar.update(sum(len(doc) for doc in batch_docs))
nlp.update(batch_docs, batch_gold, sgd=optimizer,
drop=config.dropout, losses=losses)
out_path = parses_dir / corpus / 'epoch-{i}.conllu'.format(i=i)
with nlp.use_params(optimizer.averages):
scores = evaluate(nlp, paths.dev.text, paths.dev.conllu, out_path)
print_progress(i, losses, scores)
if __name__ == '__main__':
plac.call(main)
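# Command-line sketch (paths are hypothetical; plac passes positionals in the
# order of main()'s signature, i.e. ud_dir, parses_dir, config, corpus):
#     python conllu.py /data/ud-treebanks-v2.0 /tmp/parses config.json es_ancora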


@@ -1,88 +0,0 @@
#!/usr/bin/env python
# coding: utf8
"""Export spaCy model vectors for use in TensorBoard's standalone embedding projector.
https://github.com/tensorflow/embedding-projector-standalone
Usage:
python vectors_tensorboard_standalone.py ./myVectorModel ./output [name]
This outputs two files that have to be copied into the "oss_data" of the standalone projector:
[name]_labels.tsv - metadata such as human readable labels for vectors
[name]_tensors.bytes - numpy.ndarray of numpy.float32 precision vectors
"""
from __future__ import unicode_literals
import json
import math
from os import path
import numpy
import plac
import spacy
import tqdm
@plac.annotations(
vectors_loc=("Path to spaCy model that contains vectors", "positional", None, str),
out_loc=("Path to output folder writing tensors and labels data", "positional", None, str),
name=("Human readable name for tsv file and vectors tensor", "positional", None, str),
)
def main(vectors_loc, out_loc, name="spaCy_vectors"):
# A tab-separated file that contains information about the vectors for visualization
#
# Learn more: https://www.tensorflow.org/programmers_guide/embedding#metadata
meta_file = "{}_labels.tsv".format(name)
out_meta_file = path.join(out_loc, meta_file)
print('Loading spaCy vectors model: {}'.format(vectors_loc))
model = spacy.load(vectors_loc)
print('Finding lexemes with vectors attached: {}'.format(vectors_loc))
voacb_strings = [
w for w in tqdm.tqdm(model.vocab.strings, total=len(model.vocab.strings), leave=False)
if model.vocab.has_vector(w)
]
vector_count = len(voacb_strings)
print('Building Projector labels for {} vectors: {}'.format(vector_count, out_meta_file))
vector_dimensions = model.vocab.vectors.shape[1]
tf_vectors_variable = numpy.zeros((vector_count, vector_dimensions), dtype=numpy.float32)
# Write a tab-separated file that contains information about the vectors for visualization
#
# Reference: https://www.tensorflow.org/programmers_guide/embedding#metadata
with open(out_meta_file, 'wb') as file_metadata:
# Define columns in the first row
file_metadata.write("Text\tFrequency\n".encode('utf-8'))
# Write out a row for each vector that we add to the tensorflow variable we created
vec_index = 0
for text in tqdm.tqdm(voacb_strings, total=len(voacb_strings), leave=False):
# https://github.com/tensorflow/tensorflow/issues/9094
text = '<Space>' if text.lstrip() == '' else text
lex = model.vocab[text]
# Store vector data and metadata
tf_vectors_variable[vec_index] = numpy.float64(model.vocab.get_vector(text))
file_metadata.write("{}\t{}\n".format(text, math.exp(lex.prob) * len(voacb_strings)).encode('utf-8'))
vec_index += 1
# Write out "[name]_tensors.bytes" file for standalone embeddings projector to load
tensor_path = '{}_tensors.bytes'.format(name)
tf_vectors_variable.tofile(path.join(out_loc, tensor_path))
print('Done.')
print('Add the following entry to "oss_data/oss_demo_projector_config.json"')
print(json.dumps({
"tensorName": name,
"tensorShape": [vector_count, vector_dimensions],
"tensorPath": 'oss_data/{}'.format(tensor_path),
"metadataPath": 'oss_data/{}'.format(meta_file)
}, indent=2))
if __name__ == '__main__':
plac.call(main)

81
fabfile.py vendored

@@ -1,49 +1,92 @@
 # coding: utf-8
 from __future__ import unicode_literals, print_function

+import contextlib
+from pathlib import Path
 from fabric.api import local, lcd, env, settings, prefix
-from fabtools.python import virtualenv
 from os import path, environ
+import shutil

 PWD = path.dirname(__file__)
 ENV = environ['VENV_DIR'] if 'VENV_DIR' in environ else '.env'
-VENV_DIR = path.join(PWD, ENV)
+VENV_DIR = Path(PWD) / ENV


-def env(lang='python2.7'):
-    if path.exists(VENV_DIR):
+@contextlib.contextmanager
+def virtualenv(name, create=False, python='/usr/bin/python3.6'):
+    python = Path(python).resolve()
+    env_path = VENV_DIR
+    if create:
+        if env_path.exists():
+            shutil.rmtree(str(env_path))
+        local('{python} -m venv {env_path}'.format(python=python, env_path=VENV_DIR))
+    def wrapped_local(cmd, env_vars=[], capture=False, direct=False):
+        return local('source {}/bin/activate && {}'.format(env_path, cmd),
+                     shell='/bin/bash', capture=False)
+    yield wrapped_local
+
+
+def env(lang='python3.6'):
+    if VENV_DIR.exists():
         local('rm -rf {env}'.format(env=VENV_DIR))
-    local('pip install virtualenv')
-    local('python -m virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR))
+    if lang.startswith('python3'):
+        local('{lang} -m venv {env}'.format(lang=lang, env=VENV_DIR))
+    else:
+        local('{lang} -m pip install virtualenv --no-cache-dir'.format(lang=lang))
+        local('{lang} -m virtualenv {env} --no-cache-dir'.format(lang=lang, env=VENV_DIR))
+    with virtualenv(VENV_DIR) as venv_local:
+        print(venv_local('python --version', capture=True))
+        venv_local('pip install --upgrade setuptools --no-cache-dir')
+        venv_local('pip install pytest --no-cache-dir')
+        venv_local('pip install wheel --no-cache-dir')
+        venv_local('pip install -r requirements.txt --no-cache-dir')
+        venv_local('pip install pex --no-cache-dir')


 def install():
-    with virtualenv(VENV_DIR):
-        local('pip install --upgrade setuptools')
-        local('pip install dist/*.tar.gz')
-        local('pip install pytest')
+    with virtualenv(VENV_DIR) as venv_local:
+        venv_local('pip install dist/*.tar.gz')


 def make():
-    with virtualenv(VENV_DIR):
-        with lcd(path.dirname(__file__)):
-            local('pip install cython')
-            local('pip install murmurhash')
-            local('pip install -r requirements.txt')
-            local('python setup.py build_ext --inplace')
+    with lcd(path.dirname(__file__)):
+        local('export PYTHONPATH=`pwd` && source .env/bin/activate && python setup.py build_ext --inplace',
+              shell='/bin/bash')


 def sdist():
-    with virtualenv(VENV_DIR):
+    with virtualenv(VENV_DIR) as venv_local:
         with lcd(path.dirname(__file__)):
             local('python setup.py sdist')


+def wheel():
+    with virtualenv(VENV_DIR) as venv_local:
+        with lcd(path.dirname(__file__)):
+            venv_local('python setup.py bdist_wheel')
+
+
+def pex():
+    with virtualenv(VENV_DIR) as venv_local:
+        with lcd(path.dirname(__file__)):
+            sha = local('git rev-parse --short HEAD', capture=True)
+            venv_local('pex dist/*.whl -e spacy -o dist/spacy-%s.pex' % sha,
+                       direct=True)
+
+
 def clean():
     with lcd(path.dirname(__file__)):
-        local('python setup.py clean --all')
+        local('rm -f dist/*.whl')
+        local('rm -f dist/*.pex')
+        with virtualenv(VENV_DIR) as venv_local:
+            venv_local('python setup.py clean --all')


 def test():
-    with virtualenv(VENV_DIR):
+    with virtualenv(VENV_DIR) as venv_local:
         with lcd(path.dirname(__file__)):
-            local('py.test -x spacy/tests')
+            venv_local('pytest -x spacy/tests')
+
+
+def train():
+    args = environ.get('SPACY_TRAIN_ARGS', '')
+    with virtualenv(VENV_DIR) as venv_local:
+        venv_local('spacy train {args}'.format(args=args))


@@ -5,8 +5,8 @@ cymem>=1.30,<1.32
 preshed>=1.0.0,<2.0.0
 thinc>=6.11.1.dev10,<6.12.0
 murmurhash>=0.28,<0.29
+cytoolz>=0.9.0,<0.10.0
 plac<1.0.0,>=0.9.6
-six
 ujson>=1.35
 dill>=0.2,<0.3
 requests>=2.13.0,<3.0.0
@@ -16,4 +16,3 @@ pytest>=3.0.6,<4.0.0
 mock>=2.0.0,<3.0.0
 msgpack-python==0.5.4
 msgpack-numpy==0.4.1
-html5lib==1.0b8


@@ -18,6 +18,7 @@ PACKAGES = find_packages()
 MOD_NAMES = [
+    'spacy._align',
     'spacy.parts_of_speech',
     'spacy.strings',
     'spacy.lexeme',
@@ -191,8 +192,6 @@ def setup_package():
             'preshed>=1.0.0,<2.0.0',
             'thinc>=6.11.1.dev10,<6.12.0',
             'plac<1.0.0,>=0.9.6',
-            'six',
-            'html5lib==1.0b8',
             'pathlib',
             'ujson>=1.35',
             'dill>=0.2,<0.3',
@@ -201,6 +200,7 @@ def setup_package():
             'ftfy>=4.4.2,<5.0.0',
             'msgpack-python==0.5.4',
             'msgpack-numpy==0.4.1'],
+        setup_requires=['wheel'],
         classifiers=[
             'Development Status :: 5 - Production/Stable',
             'Environment :: Console',


@@ -8,6 +8,7 @@ if __name__ == '__main__':
     import sys
     from spacy.cli import download, link, info, package, train, convert
     from spacy.cli import vocab, init_model, profile, evaluate, validate
+    from spacy.cli import ud_train, ud_evaluate
     from spacy.util import prints

     commands = {
@@ -15,7 +16,9 @@ if __name__ == '__main__':
         'link': link,
         'info': info,
         'train': train,
+        'ud-train': ud_train,
         'evaluate': evaluate,
+        'ud-evaluate': ud_evaluate,
         'convert': convert,
         'package': package,
         'vocab': vocab,

251
spacy/_align.pyx Normal file

@@ -0,0 +1,251 @@
# cython: infer_types=True
'''Do Levenshtein alignment, for evaluation of tokenized input.
Random notes:
r i n g
0 1 2 3 4
r 1 0 1 2 3
a 2 1 1 2 3
n 3 2 2 1 2
g 4 3 3 2 1
0,0: (1,1)=min(0+0,1+1,1+1)=0 S
1,0: (2,1)=min(1+1,0+1,2+1)=1 D
2,0: (3,1)=min(2+1,3+1,1+1)=2 D
3,0: (4,1)=min(3+1,4+1,2+1)=3 D
0,1: (1,2)=min(1+1,2+1,0+1)=1 D
1,1: (2,2)=min(0+1,1+1,1+1)=1 S
2,1: (3,2)=min(1+1,1+1,2+1)=2 S or I
3,1: (4,2)=min(2+1,2+1,3+1)=3 S or I
0,2: (1,3)=min(2+1,3+1,1+1)=2 I
1,2: (2,3)=min(1+1,2+1,1+1)=2 S or I
2,2: (3,3)
3,2: (4,3)
At state (i, j) we're asking "How do I transform S[:i+1] to T[:j+1]?"
We know the costs to transition:
S[:i] -> T[:j] (at D[i,j])
S[:i+1] -> T[:j] (at D[i+1,j])
S[:i] -> T[:j+1] (at D[i,j+1])
Further, we know we can transform:
S[:i+1] -> S[:i] (DEL) for 1,
T[:j+1] -> T[:j] (INS) for 1.
S[i+1] -> T[j+1] (SUB) for 0 or 1
Therefore we have the costs:
SUB: Cost(S[:i]->T[:j]) + Cost(S[i+1]->T[j+1])
i.e. D[i, j] + S[i+1] != T[j+1]
INS: Cost(S[:i+1]->T[:j]) + Cost(T[:j+1]->T[:j])
i.e. D[i+1,j] + 1
DEL: Cost(S[:i]->T[:j+1]) + Cost(S[:i+1]->S[:i])
i.e. D[i,j+1] + 1
Source string S has length m, with index i
Target string T has length n, with index j
Output two alignment vectors: i2j (length m) and j2i (length n)
# function LevenshteinDistance(char s[1..m], char t[1..n]):
# for all i and j, d[i,j] will hold the Levenshtein distance between
# the first i characters of s and the first j characters of t
# note that d has (m+1)*(n+1) values
# set each element in d to zero
ring rang
- r i n g
- 0 0 0 0 0
r 0 0 0 0 0
a 0 0 0 0 0
n 0 0 0 0 0
g 0 0 0 0 0
# source prefixes can be transformed into empty string by
# dropping all characters
# d[i, 0] := i
ring rang
- r i n g
- 0 0 0 0 0
r 1 0 0 0 0
a 2 0 0 0 0
n 3 0 0 0 0
g 4 0 0 0 0
# target prefixes can be reached from empty source prefix
# by inserting every character
# d[0, j] := j
- r i n g
- 0 1 2 3 4
r 1 0 0 0 0
a 2 0 0 0 0
n 3 0 0 0 0
g 4 0 0 0 0
'''
from __future__ import unicode_literals
from libc.stdint cimport uint32_t
import numpy
cimport numpy as np
from .compat import unicode_
from murmurhash.mrmr cimport hash32
def align(S, T):
cdef int m = len(S)
cdef int n = len(T)
cdef np.ndarray matrix = numpy.zeros((m+1, n+1), dtype='int32')
cdef np.ndarray i2j = numpy.zeros((m,), dtype='i')
cdef np.ndarray j2i = numpy.zeros((n,), dtype='i')
cdef np.ndarray S_arr = _convert_sequence(S)
cdef np.ndarray T_arr = _convert_sequence(T)
fill_matrix(<int*>matrix.data,
<const int*>S_arr.data, m, <const int*>T_arr.data, n)
fill_i2j(i2j, matrix)
fill_j2i(j2i, matrix)
for i in range(i2j.shape[0]):
if i2j[i] >= 0 and len(S[i]) != len(T[i2j[i]]):
i2j[i] = -1
for j in range(j2i.shape[0]):
if j2i[j] >= 0 and len(T[j]) != len(S[j2i[j]]):
j2i[j] = -1
return matrix[-1,-1], i2j, j2i, matrix
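# Rough usage sketch (token lists are made up; requires the compiled module):
#     cost, i2j, j2i, matrix = align(['obama', "'", 's', 'podcast'],
#                                    ['obama', "'s", 'podcast'])
# i2j[i] holds the index in T aligned one-to-one with S[i], or -1 where the
# two tokenizations disagree (as around the "'s" split above); multi_align
# below recovers those many-to-one mappings.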
def multi_align(np.ndarray i2j, np.ndarray j2i, i_lengths, j_lengths):
'''Let's say we had:
Guess: [aa bb cc dd]
Truth: [aa bbcc dd]
i2j: [0, None, -2, 2]
j2i: [0, -2, 3]
We want:
i2j_multi: {1: 1, 2: 1}
j2i_multi: {}
'''
i2j_miss = _get_regions(i2j, i_lengths)
j2i_miss = _get_regions(j2i, j_lengths)
i2j_multi, j2i_multi = _get_mapping(i2j_miss, j2i_miss, i_lengths, j_lengths)
return i2j_multi, j2i_multi
def _get_regions(alignment, lengths):
regions = {}
start = None
offset = 0
for i in range(len(alignment)):
if alignment[i] < 0:
if start is None:
start = offset
regions.setdefault(start, [])
regions[start].append(i)
else:
start = None
offset += lengths[i]
return regions
def _get_mapping(miss1, miss2, lengths1, lengths2):
i2j = {}
j2i = {}
for start, region1 in miss1.items():
if not region1 or start not in miss2:
continue
region2 = miss2[start]
if sum(lengths1[i] for i in region1) == sum(lengths2[i] for i in region2):
j = region2.pop(0)
buff = []
# Consume tokens from region 1, until we meet the length of the
# first token in region2. If we do, align the tokens. If
# we exceed the length, break.
while region1:
buff.append(region1.pop(0))
if sum(lengths1[i] for i in buff) == lengths2[j]:
for i in buff:
i2j[i] = j
j2i[j] = buff[-1]
j += 1
buff = []
elif sum(lengths1[i] for i in buff) > lengths2[j]:
break
else:
if buff and sum(lengths1[i] for i in buff) == lengths2[j]:
for i in buff:
i2j[i] = j
j2i[j] = buff[-1]
return i2j, j2i
def _convert_sequence(seq):
if isinstance(seq, numpy.ndarray):
return numpy.ascontiguousarray(seq, dtype='uint32')
cdef np.ndarray output = numpy.zeros((len(seq),), dtype='uint32')
cdef bytes item_bytes
for i, item in enumerate(seq):
if isinstance(item, unicode):
item_bytes = item.encode('utf8')
else:
item_bytes = item
output[i] = hash32(<void*><char*>item_bytes, len(item_bytes), 0)
return output
cdef void fill_matrix(int* D,
const int* S, int m, const int* T, int n) nogil:
m1 = m+1
n1 = n+1
for i in range(m1*n1):
D[i] = 0
for i in range(m1):
D[i*n1] = i
for j in range(n1):
D[j] = j
cdef int sub_cost, ins_cost, del_cost
for j in range(n):
for i in range(m):
i_j = i*n1 + j
i1_j1 = (i+1)*n1 + j+1
i1_j = (i+1)*n1 + j
i_j1 = i*n1 + j+1
if S[i] != T[j]:
sub_cost = D[i_j] + 1
else:
sub_cost = D[i_j]
del_cost = D[i_j1] + 1
ins_cost = D[i1_j] + 1
best = min(min(sub_cost, ins_cost), del_cost)
D[i1_j1] = best
cdef void fill_i2j(np.ndarray i2j, np.ndarray D) except *:
j = D.shape[1]-2
cdef int i = D.shape[0]-2
while i >= 0:
while D[i+1, j] < D[i+1, j+1]:
j -= 1
if D[i, j+1] < D[i+1, j+1]:
i2j[i] = -1
else:
i2j[i] = j
j -= 1
i -= 1
cdef void fill_j2i(np.ndarray j2i, np.ndarray D) except *:
i = D.shape[0]-2
cdef int j = D.shape[1]-2
while j >= 0:
while D[i, j+1] < D[i+1, j+1]:
i -= 1
if D[i+1, j] < D[i+1, j+1]:
j2i[j] = -1
else:
j2i[j] = i
i -= 1
j -= 1

251
spacy/_matcher2_notes.py Normal file

@@ -0,0 +1,251 @@
import pytest
class Vocab(object):
pass
class Doc(list):
def __init__(self, vocab, words=None):
list.__init__(self)
self.extend([Token(i, w) for i, w in enumerate(words)])
class Token(object):
def __init__(self, i, word):
self.i = i
self.text = word
def find_matches(patterns, doc):
init_states = [(pattern, 0, None) for pattern in patterns]
curr_states = []
matches = []
for token in doc:
nexts = []
for state in (curr_states + init_states):
matches, nexts = transition(state, token, matches, nexts)
curr_states = nexts
return matches
def transition(state, token, matches, nexts):
action = get_action(state, token)
is_match, keep_state, advance_state = [bool(int(c)) for c in action]
pattern, i, start = state
if start is None:
start = token.i
if is_match:
matches.append((pattern, start, token.i+1))
if advance_state:
nexts.append((pattern, i+1, start))
if keep_state:
# TODO: This needs to be zero-width :(.
nexts.append((pattern, i, start))
return (matches, nexts)
def get_action(state, token):
'''We need to consider:
a) Does the token match the specification? [Yes, No]
b) What's the quantifier? [1, 0+, ?]
c) Is this the last specification? [final, non-final]
We can transition in the following ways:
a) Do we emit a match?
b) Do we add a state with (next state, next token)?
c) Do we add a state with (next state, same token)?
d) Do we add a state with (same state, next token)?
We'll code the actions as boolean strings, so 0000 means no to all 4,
1000 means match but no states added, etc.
1:
Yes, final:
1000
Yes, non-final:
0100
No, final:
0000
No, non-final
0000
0+:
Yes, final:
1001
Yes, non-final:
0111
No, final:
1000 (note: Don't include last token!)
No, non-final:
0010
?:
Yes, final:
1000
Yes, non-final:
0100
No, final:
1000 (note: Don't include last token!)
No, non-final:
0010
Problem: If a quantifier is matching, we're adding a lot of open partials
'''
is_match = get_is_match(state, token)
operator = get_operator(state, token)
is_final = get_is_final(state, token)
raise NotImplementedError
def get_is_match(state, token):
pattern, i, start = state
is_match = token.text == pattern[i]['spec']
if pattern[i].get('invert'):
return not is_match
else:
return is_match
def get_is_final(state, token):
pattern, i, start = state
return i == len(pattern)-1
def get_operator(state, token):
pattern, i, start = state
return pattern[i].get('op', '1')
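# A state is the tuple (pattern, index_into_pattern, start_token_index_or_None);
# see find_matches/transition above. The tests below exercise get_action and
# find_matches on toy patterns.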
########################
# Tests for get_action #
########################
def test_get_action_simple_match():
pattern = [{'spec': 'a', 'op': '1'}]
doc = Doc(Vocab(), words=['a'])
state = (pattern, 0, None)
action = get_action(state, doc[0])
assert action == '100'
def test_get_action_simple_reject():
pattern = [{'spec': 'b', 'op': '1'}]
doc = Doc(Vocab(), words=['a'])
state = (pattern, 0, None)
action = get_action(state, doc[0])
assert action == '000'
def test_get_action_simple_match_match():
pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'a', 'op': '1'}]
doc = Doc(Vocab(), words=['a', 'a'])
state = (pattern, 0, None)
action = get_action(state, doc[0])
assert action == '001'
state = (pattern, 1, 0)
action = get_action(state, doc[1])
assert action == '100'
def test_get_action_simple_match_reject():
pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]
doc = Doc(Vocab(), words=['a', 'a'])
state = (pattern, 0, None)
action = get_action(state, doc[0])
assert action == '001'
state = (pattern, 1, 0)
action = get_action(state, doc[1])
assert action == '000'
def test_get_action_plus_match():
pattern = [{'spec': 'a', 'op': '1+'}]
doc = Doc(Vocab(), words=['a'])
state = (pattern, 0, None)
action = get_action(state, doc[0])
assert action == '110'
def test_get_action_plus_match_match():
pattern = [{'spec': 'a', 'op': '1+'}]
doc = Doc(Vocab(), words=['a', 'a'])
state = (pattern, 0, None)
action = get_action(state, doc[0])
assert action == '110'
state = (pattern, 0, 0)
action = get_action(state, doc[1])
assert action == '110'
##########################
# Tests for find_matches #
##########################
def test_find_matches_simple_accept():
pattern = [{'spec': 'a', 'op': '1'}]
doc = Doc(Vocab(), words=['a'])
matches = find_matches([pattern], doc)
assert matches == [(pattern, 0, 1)]
def test_find_matches_simple_reject():
pattern = [{'spec': 'a', 'op': '1'}]
doc = Doc(Vocab(), words=['b'])
matches = find_matches([pattern], doc)
assert matches == []
def test_find_matches_match_twice():
pattern = [{'spec': 'a', 'op': '1'}]
doc = Doc(Vocab(), words=['a', 'a'])
matches = find_matches([pattern], doc)
assert matches == [(pattern, 0, 1), (pattern, 1, 2)]
def test_find_matches_longer_pattern():
pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]
doc = Doc(Vocab(), words=['a', 'b'])
matches = find_matches([pattern], doc)
assert matches == [(pattern, 0, 2)]
def test_find_matches_two_patterns():
patterns = [[{'spec': 'a', 'op': '1'}], [{'spec': 'b', 'op': '1'}]]
doc = Doc(Vocab(), words=['a', 'b'])
matches = find_matches(patterns, doc)
assert matches == [(patterns[0], 0, 1), (patterns[1], 1, 2)]
def test_find_matches_two_patterns_overlap():
patterns = [[{'spec': 'a'}, {'spec': 'b'}],
[{'spec': 'b'}, {'spec': 'c'}]]
doc = Doc(Vocab(), words=['a', 'b', 'c'])
matches = find_matches(patterns, doc)
assert matches == [(patterns[0], 0, 2), (patterns[1], 1, 3)]
def test_find_matches_greedy():
patterns = [[{'spec': 'a', 'op': '1+'}]]
doc = Doc(Vocab(), words=['a'])
matches = find_matches(patterns, doc)
assert matches == [(patterns[0], 0, 1)]
doc = Doc(Vocab(), words=['a', 'a'])
matches = find_matches(patterns, doc)
assert matches == [(patterns[0], 0, 1), (patterns[0], 0, 2), (patterns[0], 1, 2)]
def test_find_matches_non_greedy():
patterns = [[{'spec': 'a', 'op': '0+'}, {'spec': 'b', "op": "1"}]]
doc = Doc(Vocab(), words=['b'])
matches = find_matches(patterns, doc)
assert matches == [(patterns[0], 0, 1)]


@@ -64,23 +64,6 @@ def _flatten_add_lengths(seqs, pad=0, drop=0.):
     return (X, lengths), finish_update


-@layerize
-def _logistic(X, drop=0.):
-    xp = get_array_module(X)
-    if not isinstance(X, xp.ndarray):
-        X = xp.asarray(X)
-    # Clip to range (-10, 10)
-    X = xp.minimum(X, 10., X)
-    X = xp.maximum(X, -10., X)
-    Y = 1. / (1. + xp.exp(-X))
-    def logistic_bwd(dY, sgd=None):
-        dX = dY * (Y * (1-Y))
-        return dX
-    return Y, logistic_bwd
-
-
 def _zero_init(model):
     def _zero_init_impl(self, X, y):
         self.W.fill(0)
@@ -144,8 +127,8 @@ class PrecomputableAffine(Model):
         self.nF = nF

     def begin_update(self, X, drop=0.):
-        Yf = self.ops.xp.dot(X,
-            self.W.reshape((self.nF*self.nO*self.nP, self.nI)).T)
+        Yf = self.ops.gemm(X,
+            self.W.reshape((self.nF*self.nO*self.nP, self.nI)), trans2=True)
         Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP))
         Yf = self._add_padding(Yf)
@@ -161,11 +144,11 @@ class PrecomputableAffine(Model):
             Wopfi = self.W.transpose((1, 2, 0, 3))
             Wopfi = self.ops.xp.ascontiguousarray(Wopfi)
             Wopfi = Wopfi.reshape((self.nO*self.nP, self.nF * self.nI))
-            dXf = self.ops.dot(dY.reshape((dY.shape[0], self.nO*self.nP)), Wopfi)
+            dXf = self.ops.gemm(dY.reshape((dY.shape[0], self.nO*self.nP)), Wopfi)
             # Reuse the buffer
             dWopfi = Wopfi; dWopfi.fill(0.)
-            self.ops.xp.dot(dY.T, Xf, out=dWopfi)
+            self.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
             dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI))
             # (o, p, f, i) --> (f, o, p, i)
             self.d_W += dWopfi.transpose((2, 0, 1, 3))
@@ -467,6 +450,7 @@ def SpacyVectors(docs, drop=0.):

 def build_text_classifier(nr_class, width=64, **cfg):
+    depth = cfg.get('depth', 2)
     nr_vector = cfg.get('nr_vector', 5000)
     pretrained_dims = cfg.get('pretrained_dims', 0)
     with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
@@ -518,7 +502,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
                 LN(Maxout(width, vectors_width))
                 >> Residual(
                     (ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
-                ) ** 2, pad=2
+                ) ** depth, pad=depth
             )
             >> flatten_add_lengths
             >> ParametricAttention(width)
@@ -531,8 +515,6 @@ def build_text_classifier(nr_class, width=64, **cfg):
             _preprocess_doc
             >> LinearModel(nr_class)
         )
-        #model = linear_model >> logistic
-
         model = (
             (linear_model | cnn_model)
             >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))


@@ -9,7 +9,7 @@ __uri__ = 'https://spacy.io'
 __author__ = 'Explosion AI'
 __email__ = 'contact@explosion.ai'
 __license__ = 'MIT'
-__release__ = True
+__release__ = False
 __docs_models__ = 'https://spacy.io/usage/models'
 __download_url__ = 'https://github.com/explosion/spacy-models/releases/download'


@@ -131,7 +131,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
         'NumValue', 'PartType', 'Polite', 'StyleVariant',
         'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
         'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
-        'Polarity', 'Animacy'  # U20
+        'Polarity', 'PrepCase', 'Animacy'  # U20
     ]
     for key in morph_keys:
         if key in stringy_attrs:


@@ -9,3 +9,5 @@ from .convert import convert
 from .vocab import make_vocab as vocab
 from .init_model import init_model
 from .validate import validate
+from .ud_train import main as ud_train
+from .conll17_ud_eval import main as ud_evaluate


@@ -0,0 +1,571 @@
#!/usr/bin/env python
# CoNLL 2017 UD Parsing evaluation script.
#
# Compatible with Python 2.7 and 3.2+, can be used either as a module
# or a standalone executable.
#
# Copyright 2017 Institute of Formal and Applied Linguistics (UFAL),
# Faculty of Mathematics and Physics, Charles University, Czech Republic.
#
# Changelog:
# - [02 Jan 2017] Version 0.9: Initial release
# - [25 Jan 2017] Version 0.9.1: Fix bug in LCS alignment computation
# - [10 Mar 2017] Version 1.0: Add documentation and test
# Compare HEADs correctly using aligned words
# Allow evaluation with erroneous spaces in forms
# Compare forms in LCS case insensitively
# Detect cycles and multiple root nodes
# Compute AlignedAccuracy
# Command line usage
# ------------------
# conll17_ud_eval.py [-v] [-w weights_file] gold_conllu_file system_conllu_file
#
# - if no -v is given, only the CoNLL17 UD Shared Task evaluation LAS metric
# is printed
# - if -v is given, several metrics are printed (as precision, recall, F1 score,
# and in case the metric is computed on aligned words also accuracy on these):
# - Tokens: how well do the gold tokens match system tokens
# - Sentences: how well do the gold sentences match system sentences
# - Words: how well can the gold words be aligned to system words
# - UPOS: using aligned words, how well does UPOS match
# - XPOS: using aligned words, how well does XPOS match
# - Feats: using aligned words, how well does FEATS match
# - AllTags: using aligned words, how well does UPOS+XPOS+FEATS match
# - Lemmas: using aligned words, how well does LEMMA match
# - UAS: using aligned words, how well does HEAD match
# - LAS: using aligned words, how well does HEAD+DEPREL(ignoring subtypes) match
# - if weights_file is given (with lines containing deprel-weight pairs),
# one more metric is shown:
# - WeightedLAS: as LAS, but each deprel (ignoring subtypes) has different weight
# API usage
# ---------
# - load_conllu(file)
# - loads CoNLL-U file from given file object to an internal representation
# - the file object should return str on both Python 2 and Python 3
# - raises UDError exception if the given file cannot be loaded
# - evaluate(gold_ud, system_ud)
# - evaluate the given gold and system CoNLL-U files (loaded with load_conllu)
# - raises UDError if the concatenated tokens of gold and system file do not match
# - returns a dictionary with the metrics described above, each metrics having
# three fields: precision, recall and f1
# Description of token matching
# -----------------------------
# In order to match tokens of gold file and system file, we consider the text
# resulting from concatenation of gold tokens and text resulting from
# concatenation of system tokens. These texts should match -- if they do not,
# the evaluation fails.
#
# If the texts do match, every token is represented as a range in this original
# text, and tokens are equal only if their range is the same.
# Description of word matching
# ----------------------------
# When matching words of gold file and system file, we first match the tokens.
# The words which are also tokens are matched as tokens, but words in multi-word
# tokens have to be handled differently.
#
# To handle multi-word tokens, we start by finding "multi-word spans".
# Multi-word span is a span in the original text such that
# - it contains at least one multi-word token
# - all multi-word tokens in the span (considering both gold and system ones)
# are completely inside the span (i.e., they do not "stick out")
# - the multi-word span is as small as possible
#
# For every multi-word span, we align the gold and system words completely
# inside this span using LCS on their FORMs. The words not intersecting
# (even partially) any multi-word span are then aligned as tokens.
from __future__ import division
from __future__ import print_function
import argparse
import io
import sys
import unittest
# CoNLL-U column names
ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10)
# UD Error is used when raising exceptions in this module
class UDError(Exception):
pass
# Load given CoNLL-U file into internal representation
def load_conllu(file):
# Internal representation classes
class UDRepresentation:
def __init__(self):
# Characters of all the tokens in the whole file.
# Whitespace between tokens is not included.
self.characters = []
# List of UDSpan instances with start&end indices into `characters`.
self.tokens = []
# List of UDWord instances.
self.words = []
# List of UDSpan instances with start&end indices into `characters`.
self.sentences = []
class UDSpan:
def __init__(self, start, end, characters):
self.start = start
# Note that self.end marks the first position **after the end** of span,
# so we can use characters[start:end] or range(start, end).
self.end = end
self.characters = characters
@property
def text(self):
return ''.join(self.characters[self.start:self.end])
def __str__(self):
return self.text
def __repr__(self):
return self.text
class UDWord:
def __init__(self, span, columns, is_multiword):
# Span of this word (or MWT, see below) within ud_representation.characters.
self.span = span
# 10 columns of the CoNLL-U file: ID, FORM, LEMMA,...
self.columns = columns
# is_multiword==True means that this word is part of a multi-word token.
# In that case, self.span marks the span of the whole multi-word token.
self.is_multiword = is_multiword
# Reference to the UDWord instance representing the HEAD (or None if root).
self.parent = None
# Let's ignore language-specific deprel subtypes.
self.columns[DEPREL] = columns[DEPREL].split(':')[0]
ud = UDRepresentation()
# Load the CoNLL-U file
index, sentence_start = 0, None
linenum = 0
while True:
line = file.readline()
linenum += 1
if not line:
break
line = line.rstrip("\r\n")
# Handle sentence start boundaries
if sentence_start is None:
# Skip comments
if line.startswith("#"):
continue
# Start a new sentence
ud.sentences.append(UDSpan(index, 0, ud.characters))
sentence_start = len(ud.words)
if not line:
# Add parent UDWord links and check there are no cycles
def process_word(word):
if word.parent == "remapping":
raise UDError("There is a cycle in a sentence")
if word.parent is None:
head = int(word.columns[HEAD])
if head > len(ud.words) - sentence_start:
raise UDError("Line {}: HEAD '{}' points outside of the sentence".format(
linenum, word.columns[HEAD]))
if head:
parent = ud.words[sentence_start + head - 1]
word.parent = "remapping"
process_word(parent)
word.parent = parent
for word in ud.words[sentence_start:]:
process_word(word)
# Check there is a single root node
if len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1:
raise UDError("There are multiple roots in a sentence")
# End the sentence
ud.sentences[-1].end = index
sentence_start = None
continue
# Read next token/word
columns = line.split("\t")
if len(columns) != 10:
raise UDError("The CoNLL-U line {} does not contain 10 tab-separated columns: '{}'".format(linenum, line))
# Skip empty nodes
if "." in columns[ID]:
continue
# Delete spaces from FORM so gold.characters == system.characters
# even if one of them tokenizes the space.
columns[FORM] = columns[FORM].replace(" ", "")
if not columns[FORM]:
raise UDError("There is an empty FORM in the CoNLL-U file -- line %d" % linenum)
# Save token
ud.characters.extend(columns[FORM])
ud.tokens.append(UDSpan(index, index + len(columns[FORM]), ud.characters))
index += len(columns[FORM])
# Handle multi-word tokens to save word(s)
if "-" in columns[ID]:
try:
start, end = map(int, columns[ID].split("-"))
except:
raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID]))
for _ in range(start, end + 1):
word_line = file.readline().rstrip("\r\n")
word_columns = word_line.split("\t")
if len(word_columns) != 10:
print(columns)
raise UDError("The CoNLL-U line {} does not contain 10 tab-separated columns: '{}'".format(linenum, word_line))
ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True))
# Basic tokens/words
else:
try:
word_id = int(columns[ID])
except:
raise UDError("Cannot parse word ID '{}'".format(columns[ID]))
if word_id != len(ud.words) - sentence_start + 1:
raise UDError("Incorrect word ID '{}' for word '{}', expected '{}'".format(columns[ID], columns[FORM], len(ud.words) - sentence_start + 1))
try:
head_id = int(columns[HEAD])
except:
raise UDError("Cannot parse HEAD '{}'".format(columns[HEAD]))
if head_id < 0:
raise UDError("HEAD cannot be negative")
ud.words.append(UDWord(ud.tokens[-1], columns, is_multiword=False))
if sentence_start is not None:
raise UDError("The CoNLL-U file does not end with empty line")
return ud
# Evaluate the gold and system treebanks (loaded using load_conllu).
def evaluate(gold_ud, system_ud, deprel_weights=None):
class Score:
def __init__(self, gold_total, system_total, correct, aligned_total=None):
self.precision = correct / system_total if system_total else 0.0
self.recall = correct / gold_total if gold_total else 0.0
self.f1 = 2 * correct / (system_total + gold_total) if system_total + gold_total else 0.0
self.aligned_accuracy = correct / aligned_total if aligned_total else aligned_total
class AlignmentWord:
def __init__(self, gold_word, system_word):
self.gold_word = gold_word
self.system_word = system_word
self.gold_parent = None
self.system_parent_gold_aligned = None
class Alignment:
def __init__(self, gold_words, system_words):
self.gold_words = gold_words
self.system_words = system_words
self.matched_words = []
self.matched_words_map = {}
def append_aligned_words(self, gold_word, system_word):
self.matched_words.append(AlignmentWord(gold_word, system_word))
self.matched_words_map[system_word] = gold_word
def fill_parents(self):
# We represent root parents in both gold and system data by '0'.
# For gold data, we represent non-root parent by corresponding gold word.
# For system data, we represent non-root parent by either gold word aligned
# to parent system nodes, or by None if no gold word is aligned to the parent.
for words in self.matched_words:
words.gold_parent = words.gold_word.parent if words.gold_word.parent is not None else 0
words.system_parent_gold_aligned = self.matched_words_map.get(words.system_word.parent, None) \
if words.system_word.parent is not None else 0
def lower(text):
if sys.version_info < (3, 0) and isinstance(text, str):
return text.decode("utf-8").lower()
return text.lower()
def spans_score(gold_spans, system_spans):
correct, gi, si = 0, 0, 0
while gi < len(gold_spans) and si < len(system_spans):
if system_spans[si].start < gold_spans[gi].start:
si += 1
elif gold_spans[gi].start < system_spans[si].start:
gi += 1
else:
correct += gold_spans[gi].end == system_spans[si].end
si += 1
gi += 1
return Score(len(gold_spans), len(system_spans), correct)
def alignment_score(alignment, key_fn, weight_fn=lambda w: 1):
gold, system, aligned, correct = 0, 0, 0, 0
for word in alignment.gold_words:
gold += weight_fn(word)
for word in alignment.system_words:
system += weight_fn(word)
for words in alignment.matched_words:
aligned += weight_fn(words.gold_word)
if key_fn is None:
# Return score for whole aligned words
return Score(gold, system, aligned)
for words in alignment.matched_words:
if key_fn(words.gold_word, words.gold_parent) == key_fn(words.system_word, words.system_parent_gold_aligned):
correct += weight_fn(words.gold_word)
return Score(gold, system, correct, aligned)
def beyond_end(words, i, multiword_span_end):
if i >= len(words):
return True
if words[i].is_multiword:
return words[i].span.start >= multiword_span_end
return words[i].span.end > multiword_span_end
def extend_end(word, multiword_span_end):
if word.is_multiword and word.span.end > multiword_span_end:
return word.span.end
return multiword_span_end
def find_multiword_span(gold_words, system_words, gi, si):
# We know gold_words[gi].is_multiword or system_words[si].is_multiword.
# Find the start of the multiword span (gs, ss), so the multiword span is minimal.
# Initialize multiword_span_end characters index.
if gold_words[gi].is_multiword:
multiword_span_end = gold_words[gi].span.end
if not system_words[si].is_multiword and system_words[si].span.start < gold_words[gi].span.start:
si += 1
else: # if system_words[si].is_multiword
multiword_span_end = system_words[si].span.end
if not gold_words[gi].is_multiword and gold_words[gi].span.start < system_words[si].span.start:
gi += 1
gs, ss = gi, si
# Find the end of the multiword span
# (so both gi and si are pointing to the word following the multiword span end).
while not beyond_end(gold_words, gi, multiword_span_end) or \
not beyond_end(system_words, si, multiword_span_end):
if gi < len(gold_words) and (si >= len(system_words) or
gold_words[gi].span.start <= system_words[si].span.start):
multiword_span_end = extend_end(gold_words[gi], multiword_span_end)
gi += 1
else:
multiword_span_end = extend_end(system_words[si], multiword_span_end)
si += 1
return gs, ss, gi, si
def compute_lcs(gold_words, system_words, gi, si, gs, ss):
lcs = [[0] * (si - ss) for i in range(gi - gs)]
for g in reversed(range(gi - gs)):
for s in reversed(range(si - ss)):
if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]):
lcs[g][s] = 1 + (lcs[g+1][s+1] if g+1 < gi-gs and s+1 < si-ss else 0)
lcs[g][s] = max(lcs[g][s], lcs[g+1][s] if g+1 < gi-gs else 0)
lcs[g][s] = max(lcs[g][s], lcs[g][s+1] if s+1 < si-ss else 0)
return lcs
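# compute_lcs fills the standard longest-common-subsequence DP table over the
# case-folded FORM columns of the two word ranges; align_words below walks the
# table to pair up words inside each multi-word span.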
def align_words(gold_words, system_words):
alignment = Alignment(gold_words, system_words)
gi, si = 0, 0
while gi < len(gold_words) and si < len(system_words):
if gold_words[gi].is_multiword or system_words[si].is_multiword:
# A: Multi-word tokens => align via LCS within the whole "multiword span".
gs, ss, gi, si = find_multiword_span(gold_words, system_words, gi, si)
if si > ss and gi > gs:
lcs = compute_lcs(gold_words, system_words, gi, si, gs, ss)
# Store aligned words
s, g = 0, 0
while g < gi - gs and s < si - ss:
if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]):
alignment.append_aligned_words(gold_words[gs+g], system_words[ss+s])
g += 1
s += 1
elif lcs[g][s] == (lcs[g+1][s] if g+1 < gi-gs else 0):
g += 1
else:
s += 1
else:
# B: No multi-word token => align according to spans.
if (gold_words[gi].span.start, gold_words[gi].span.end) == (system_words[si].span.start, system_words[si].span.end):
alignment.append_aligned_words(gold_words[gi], system_words[si])
gi += 1
si += 1
elif gold_words[gi].span.start <= system_words[si].span.start:
gi += 1
else:
si += 1
alignment.fill_parents()
return alignment
# Check that underlying character sequences do match
if gold_ud.characters != system_ud.characters:
index = 0
while gold_ud.characters[index] == system_ud.characters[index]:
index += 1
raise UDError(
"The concatenation of tokens in gold file and in system file differ!\n" +
"First 20 differing characters in gold file: '{}' and system file: '{}'".format(
"".join(gold_ud.characters[index:index + 20]),
"".join(system_ud.characters[index:index + 20])
)
)
# Align words
alignment = align_words(gold_ud.words, system_ud.words)
# Compute the F1-scores
result = {
"Tokens": spans_score(gold_ud.tokens, system_ud.tokens),
"Sentences": spans_score(gold_ud.sentences, system_ud.sentences),
"Words": alignment_score(alignment, None),
"UPOS": alignment_score(alignment, lambda w, parent: w.columns[UPOS]),
"XPOS": alignment_score(alignment, lambda w, parent: w.columns[XPOS]),
"Feats": alignment_score(alignment, lambda w, parent: w.columns[FEATS]),
"AllTags": alignment_score(alignment, lambda w, parent: (w.columns[UPOS], w.columns[XPOS], w.columns[FEATS])),
"Lemmas": alignment_score(alignment, lambda w, parent: w.columns[LEMMA]),
"UAS": alignment_score(alignment, lambda w, parent: parent),
"LAS": alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL])),
}
# Add WeightedLAS if weights are given
if deprel_weights is not None:
def weighted_las(word):
return deprel_weights.get(word.columns[DEPREL], 1.0)
result["WeightedLAS"] = alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL]), weighted_las)
return result
def load_deprel_weights(weights_file):
if weights_file is None:
return None
deprel_weights = {}
for line in weights_file:
# Ignore comments and empty lines
if line.startswith("#") or not line.strip():
continue
columns = line.rstrip("\r\n").split()
if len(columns) != 2:
raise ValueError("Expected two columns in the UD Relations weights file on line '{}'".format(line))
deprel_weights[columns[0]] = float(columns[1])
return deprel_weights
def load_conllu_file(path):
_file = open(path, mode="r", **({"encoding": "utf-8"} if sys.version_info >= (3, 0) else {}))
return load_conllu(_file)
def evaluate_wrapper(args):
# Load CoNLL-U files
gold_ud = load_conllu_file(args.gold_file)
system_ud = load_conllu_file(args.system_file)
# Load weights if requested
deprel_weights = load_deprel_weights(args.weights)
return evaluate(gold_ud, system_ud, deprel_weights)
def main():
# Parse arguments
parser = argparse.ArgumentParser()
parser.add_argument("gold_file", type=str,
help="Name of the CoNLL-U file with the gold data.")
parser.add_argument("system_file", type=str,
help="Name of the CoNLL-U file with the predicted data.")
parser.add_argument("--weights", "-w", type=argparse.FileType("r"), default=None,
metavar="deprel_weights_file",
help="Compute WeightedLAS using given weights for Universal Dependency Relations.")
parser.add_argument("--verbose", "-v", default=0, action="count",
help="Print all metrics.")
args = parser.parse_args()
# Use verbose if weights are supplied
if args.weights is not None and not args.verbose:
args.verbose = 1
# Evaluate
evaluation = evaluate_wrapper(args)
# Print the evaluation
if not args.verbose:
print("LAS F1 Score: {:.2f}".format(100 * evaluation["LAS"].f1))
else:
metrics = ["Tokens", "Sentences", "Words", "UPOS", "XPOS", "Feats", "AllTags", "Lemmas", "UAS", "LAS"]
if args.weights is not None:
metrics.append("WeightedLAS")
print("Metrics | Precision | Recall | F1 Score | AligndAcc")
print("-----------+-----------+-----------+-----------+-----------")
for metric in metrics:
print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format(
metric,
100 * evaluation[metric].precision,
100 * evaluation[metric].recall,
100 * evaluation[metric].f1,
"{:10.2f}".format(100 * evaluation[metric].aligned_accuracy) if evaluation[metric].aligned_accuracy is not None else ""
))
if __name__ == "__main__":
main()
# Tests, which can be executed with `python -m unittest conll17_ud_eval`.
class TestAlignment(unittest.TestCase):
@staticmethod
def _load_words(words):
"""Prepare fake CoNLL-U files with fake HEAD to prevent multiple roots errors."""
lines, num_words = [], 0
for w in words:
parts = w.split(" ")
if len(parts) == 1:
num_words += 1
lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, parts[0], int(num_words>1)))
else:
lines.append("{}-{}\t{}\t_\t_\t_\t_\t_\t_\t_\t_".format(num_words + 1, num_words + len(parts) - 1, parts[0]))
for part in parts[1:]:
num_words += 1
lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, part, int(num_words>1)))
return load_conllu((io.StringIO if sys.version_info >= (3, 0) else io.BytesIO)("\n".join(lines+["\n"])))
def _test_exception(self, gold, system):
self.assertRaises(UDError, evaluate, self._load_words(gold), self._load_words(system))
def _test_ok(self, gold, system, correct):
metrics = evaluate(self._load_words(gold), self._load_words(system))
gold_words = sum((max(1, len(word.split(" ")) - 1) for word in gold))
system_words = sum((max(1, len(word.split(" ")) - 1) for word in system))
self.assertEqual((metrics["Words"].precision, metrics["Words"].recall, metrics["Words"].f1),
(correct / system_words, correct / gold_words, 2 * correct / (gold_words + system_words)))
def test_exception(self):
self._test_exception(["a"], ["b"])
def test_equal(self):
self._test_ok(["a"], ["a"], 1)
self._test_ok(["a", "b", "c"], ["a", "b", "c"], 3)
def test_equal_with_multiword(self):
self._test_ok(["abc a b c"], ["a", "b", "c"], 3)
self._test_ok(["a", "bc b c", "d"], ["a", "b", "c", "d"], 4)
self._test_ok(["abcd a b c d"], ["ab a b", "cd c d"], 4)
self._test_ok(["abc a b c", "de d e"], ["a", "bcd b c d", "e"], 5)
def test_alignment(self):
self._test_ok(["abcd"], ["a", "b", "c", "d"], 0)
self._test_ok(["abc", "d"], ["a", "b", "c", "d"], 1)
self._test_ok(["a", "bc", "d"], ["a", "b", "c", "d"], 2)
self._test_ok(["a", "bc b c", "d"], ["a", "b", "cd"], 2)
self._test_ok(["abc a BX c", "def d EX f"], ["ab a b", "cd c d", "ef e f"], 4)
self._test_ok(["ab a b", "cd bc d"], ["a", "bc", "d"], 2)
self._test_ok(["a", "bc b c", "d"], ["ab AX BX", "cd CX a"], 1)


@ -8,8 +8,8 @@ from thinc.neural._classes.model import Model
from timeit import default_timer as timer from timeit import default_timer as timer
from ..attrs import PROB, IS_OOV, CLUSTER, LANG from ..attrs import PROB, IS_OOV, CLUSTER, LANG
from ..gold import GoldCorpus, minibatch from ..gold import GoldCorpus
from ..util import prints from ..util import prints, minibatch, minibatch_by_words
from .. import util from .. import util
from .. import about from .. import about
from .. import displacy from .. import displacy
@ -51,8 +51,6 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
train_path = util.ensure_path(train_data) train_path = util.ensure_path(train_data)
dev_path = util.ensure_path(dev_data) dev_path = util.ensure_path(dev_data)
meta_path = util.ensure_path(meta_path) meta_path = util.ensure_path(meta_path)
if not output_path.exists():
output_path.mkdir()
if not train_path.exists(): if not train_path.exists():
prints(train_path, title="Training data not found", exits=1) prints(train_path, title="Training data not found", exits=1)
if dev_path and not dev_path.exists(): if dev_path and not dev_path.exists():
@ -66,6 +64,13 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
meta.setdefault('lang', lang) meta.setdefault('lang', lang)
meta.setdefault('name', 'unnamed') meta.setdefault('name', 'unnamed')
if not output_path.exists():
output_path.mkdir()
print("Counting training words (limit=%s" % n_sents)
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
n_train_words = corpus.count_train()
print(n_train_words)
pipeline = ['tagger', 'parser', 'ner'] pipeline = ['tagger', 'parser', 'ner']
if no_tagger and 'tagger' in pipeline: if no_tagger and 'tagger' in pipeline:
pipeline.remove('tagger') pipeline.remove('tagger')
@ -81,13 +86,9 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2), dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
util.env_opt('dropout_to', 0.2), util.env_opt('dropout_to', 0.2),
util.env_opt('dropout_decay', 0.0)) util.env_opt('dropout_decay', 0.0))
batch_sizes = util.compounding(util.env_opt('batch_from', 1), batch_sizes = util.compounding(util.env_opt('batch_from', 1000),
util.env_opt('batch_to', 16), util.env_opt('batch_to', 1000),
util.env_opt('batch_compound', 1.001)) util.env_opt('batch_compound', 1.001))
max_doc_len = util.env_opt('max_doc_len', 5000)
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
n_train_words = corpus.count_train()
lang_class = util.get_lang_class(lang) lang_class = util.get_lang_class(lang)
nlp = lang_class() nlp = lang_class()
meta['pipeline'] = pipeline meta['pipeline'] = pipeline
@ -105,6 +106,7 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
lex.is_oov = False lex.is_oov = False
for name in pipeline: for name in pipeline:
nlp.add_pipe(nlp.create_pipe(name), name=name) nlp.add_pipe(nlp.create_pipe(name), name=name)
nlp.add_pipe(nlp.create_pipe('merge_subtokens'))
if parser_multitasks: if parser_multitasks:
for objective in parser_multitasks.split(','): for objective in parser_multitasks.split(','):
nlp.parser.add_multitask_objective(objective) nlp.parser.add_multitask_objective(objective)
@ -116,21 +118,20 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
print("Itn.\tP.Loss\tN.Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %") print("Itn.\tP.Loss\tN.Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
try: try:
train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
gold_preproc=gold_preproc, max_length=0)
train_docs = list(train_docs)
for i in range(n_iter): for i in range(n_iter):
train_docs = corpus.train_docs(nlp, noise_level=0.0,
gold_preproc=gold_preproc, max_length=0)
words_seen = 0
with tqdm.tqdm(total=n_train_words, leave=False) as pbar: with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
losses = {} losses = {}
for batch in minibatch(train_docs, size=batch_sizes): for batch in minibatch_by_words(train_docs, size=batch_sizes):
batch = [(d, g) for (d, g) in batch if len(d) < max_doc_len]
if not batch: if not batch:
continue continue
docs, golds = zip(*batch) docs, golds = zip(*batch)
nlp.update(docs, golds, sgd=optimizer, nlp.update(docs, golds, sgd=optimizer,
drop=next(dropout_rates), losses=losses) drop=next(dropout_rates), losses=losses)
pbar.update(sum(len(doc) for doc in docs)) pbar.update(sum(len(doc) for doc in docs))
words_seen += sum(len(doc) for doc in docs)
with nlp.use_params(optimizer.averages): with nlp.use_params(optimizer.averages):
util.set_env_log(False) util.set_env_log(False)
epoch_model_path = output_path / ('model%d' % i) epoch_model_path = output_path / ('model%d' % i)
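The hunk above swaps the plain size-based `minibatch` for `util.minibatch_by_words`, so each update sees a roughly constant number of words rather than a constant number of documents. As an illustration only (this is not spaCy's implementation), batching by word count can be sketched as:

def batch_by_words(pairs, max_words=5000):
    # Illustrative sketch: group (doc, gold) pairs so each batch stays under
    # a total word budget. spaCy's util.minibatch_by_words differs in detail.
    batch, n_words = [], 0
    for doc, gold in pairs:
        if batch and n_words + len(doc) > max_words:
            yield batch
            batch, n_words = [], 0
        batch.append((doc, gold))
        n_words += len(doc)
    if batch:
        yield batch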


@ -1,7 +1,6 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import six
import ftfy import ftfy
import sys import sys
import ujson import ujson
@ -47,8 +46,9 @@ is_windows = sys.platform.startswith('win')
is_linux = sys.platform.startswith('linux') is_linux = sys.platform.startswith('linux')
is_osx = sys.platform == 'darwin' is_osx = sys.platform == 'darwin'
is_python2 = six.PY2 # See: https://github.com/benjaminp/six/blob/master/six.py
is_python3 = six.PY3 is_python2 = sys.version_info[0] == 2
is_python3 = sys.version_info[0] == 3
is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1] < 5) is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1] < 5)
if is_python2: if is_python2:


@ -3,16 +3,25 @@
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
import re import re
import ujson
import random import random
import cytoolz import cytoolz
import itertools import itertools
import numpy
import tempfile
import shutil
from pathlib import Path
import msgpack
import ujson
from . import _align
from .syntax import nonproj from .syntax import nonproj
from .tokens import Doc from .tokens import Doc
from . import util from . import util
from .util import minibatch from .util import minibatch, itershuffle
from .compat import json_dumps
from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek
def tags_to_entities(tags): def tags_to_entities(tags):
entities = [] entities = []
@ -59,196 +68,62 @@ def merge_sents(sents):
return [(m_deps, m_brackets)] return [(m_deps, m_brackets)]
def align(cand_words, gold_words):
cost, edit_path = _min_edit_path(cand_words, gold_words)
alignment = []
i_of_gold = 0
for move in edit_path:
if move == 'M':
alignment.append(i_of_gold)
i_of_gold += 1
elif move == 'S':
alignment.append(None)
i_of_gold += 1
elif move == 'D':
alignment.append(None)
elif move == 'I':
i_of_gold += 1
else:
raise Exception(move)
return alignment
punct_re = re.compile(r'\W') punct_re = re.compile(r'\W')
def align(cand_words, gold_words):
def _min_edit_path(cand_words, gold_words):
cdef:
Pool mem
int i, j, n_cand, n_gold
int* curr_costs
int* prev_costs
# TODO: Fix this --- just do it properly, make the full edit matrix and
# then walk back over it...
# Preprocess inputs
cand_words = [punct_re.sub('', w).lower() for w in cand_words]
gold_words = [punct_re.sub('', w).lower() for w in gold_words]
if cand_words == gold_words: if cand_words == gold_words:
return 0, ''.join(['M' for _ in gold_words]) alignment = numpy.arange(len(cand_words))
mem = Pool() return 0, alignment, alignment, {}, {}
n_cand = len(cand_words) cand_words = [w.replace(' ', '') for w in cand_words]
n_gold = len(gold_words) gold_words = [w.replace(' ', '') for w in gold_words]
# Levenshtein distance, except we need the history, and we may want cost, i2j, j2i, matrix = _align.align(cand_words, gold_words)
# different costs. Mark operations with a string, and score the history i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in cand_words],
# using _edit_cost. [len(w) for w in gold_words])
previous_row = [] for i, j in list(i2j_multi.items()):
prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int)) if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j:
curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int)) i2j[i] = j
for i in range(n_gold + 1): i2j_multi.pop(i)
cell = '' for j, i in list(j2i_multi.items()):
for j in range(i): if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i:
cell += 'I' j2i[j] = i
previous_row.append('I' * i) j2i_multi.pop(j)
prev_costs[i] = i return cost, i2j, j2i, i2j_multi, j2i_multi
for i, cand in enumerate(cand_words):
current_row = ['D' * (i + 1)]
curr_costs[0] = i+1
for j, gold in enumerate(gold_words):
if gold.lower() == cand.lower():
s_cost = prev_costs[j]
i_cost = curr_costs[j] + 1
d_cost = prev_costs[j + 1] + 1
else:
s_cost = prev_costs[j] + 1
i_cost = curr_costs[j] + 1
d_cost = prev_costs[j + 1] + (1 if cand else 0)
if s_cost <= i_cost and s_cost <= d_cost:
best_cost = s_cost
best_hist = previous_row[j] + ('M' if gold == cand else 'S')
elif i_cost <= s_cost and i_cost <= d_cost:
best_cost = i_cost
best_hist = current_row[j] + 'I'
else:
best_cost = d_cost
best_hist = previous_row[j + 1] + 'D'
current_row.append(best_hist)
curr_costs[j+1] = best_cost
previous_row = current_row
for j in range(len(gold_words) + 1):
prev_costs[j] = curr_costs[j]
curr_costs[j] = 0
return prev_costs[n_gold], previous_row[-1]
class GoldCorpus(object): class GoldCorpus(object):
"""An annotated corpus, using the JSON file format. Manages """An annotated corpus, using the JSON file format. Manages
annotations for tagging, dependency parsing and NER.""" annotations for tagging, dependency parsing and NER."""
def __init__(self, train_path, dev_path, gold_preproc=True, limit=None): def __init__(self, train, dev, gold_preproc=False, limit=None):
"""Create a GoldCorpus. """Create a GoldCorpus.
train_path (unicode or Path): File or directory of training data. train_path (unicode or Path): File or directory of training data.
dev_path (unicode or Path): File or directory of development data. dev_path (unicode or Path): File or directory of development data.
RETURNS (GoldCorpus): The newly created object. RETURNS (GoldCorpus): The newly created object.
""" """
self.train_path = util.ensure_path(train_path)
self.dev_path = util.ensure_path(dev_path)
self.limit = limit self.limit = limit
self.train_locs = self.walk_corpus(self.train_path) if isinstance(train, str) or isinstance(train, Path):
self.dev_locs = self.walk_corpus(self.dev_path) train = self.read_tuples(self.walk_corpus(train))
dev = self.read_tuples(self.walk_corpus(dev))
@property # Write temp directory with one doc per file, so we can shuffle
def train_tuples(self): # and stream
i = 0 self.tmp_dir = Path(tempfile.mkdtemp())
for loc in self.train_locs: self.write_msgpack(self.tmp_dir / 'train', train)
gold_tuples = read_json_file(loc) self.write_msgpack(self.tmp_dir / 'dev', dev)
for item in gold_tuples:
yield item
i += len(item[1])
if self.limit and i >= self.limit:
break
@property def __del__(self):
def dev_tuples(self): shutil.rmtree(self.tmp_dir)
i = 0
for loc in self.dev_locs:
gold_tuples = read_json_file(loc)
for item in gold_tuples:
yield item
i += len(item[1])
if self.limit and i >= self.limit:
break
def count_train(self): @staticmethod
n = 0 def write_msgpack(directory, doc_tuples):
i = 0 if not directory.exists():
for raw_text, paragraph_tuples in self.train_tuples: directory.mkdir()
n += sum([len(s[0][1]) for s in paragraph_tuples]) for i, doc_tuple in enumerate(doc_tuples):
if self.limit and i >= self.limit: with open(directory / '{}.msg'.format(i), 'wb') as file_:
break msgpack.dump([doc_tuple], file_, use_bin_type=True, encoding='utf8')
i += len(paragraph_tuples)
return n
def train_docs(self, nlp, gold_preproc=False,
projectivize=False, max_length=None,
noise_level=0.0):
train_tuples = self.train_tuples
if projectivize:
train_tuples = nonproj.preprocess_training_data(
self.train_tuples, label_freq_cutoff=100)
random.shuffle(train_tuples)
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
max_length=max_length,
noise_level=noise_level)
yield from gold_docs
def dev_docs(self, nlp, gold_preproc=False):
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
yield from gold_docs
@classmethod
def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
noise_level=0.0):
for raw_text, paragraph_tuples in tuples:
if gold_preproc:
raw_text = None
else:
paragraph_tuples = merge_sents(paragraph_tuples)
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
gold_preproc, noise_level=noise_level)
golds = cls._make_golds(docs, paragraph_tuples)
for doc, gold in zip(docs, golds):
if (not max_length) or len(doc) < max_length:
yield doc, gold
@classmethod
def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc,
noise_level=0.0):
if raw_text is not None:
raw_text = add_noise(raw_text, noise_level)
return [nlp.make_doc(raw_text)]
else:
return [Doc(nlp.vocab,
words=add_noise(sent_tuples[1], noise_level))
for (sent_tuples, brackets) in paragraph_tuples]
@classmethod
def _make_golds(cls, docs, paragraph_tuples):
assert len(docs) == len(paragraph_tuples)
if len(docs) == 1:
return [GoldParse.from_annot_tuples(docs[0],
paragraph_tuples[0][0])]
else:
return [GoldParse.from_annot_tuples(doc, sent_tuples)
for doc, (sent_tuples, brackets)
in zip(docs, paragraph_tuples)]
@staticmethod @staticmethod
def walk_corpus(path): def walk_corpus(path):
path = util.ensure_path(path)
if not path.is_dir(): if not path.is_dir():
return [path] return [path]
paths = [path] paths = [path]
@ -266,6 +141,101 @@ class GoldCorpus(object):
locs.append(path) locs.append(path)
return locs return locs
@staticmethod
def read_tuples(locs, limit=0):
i = 0
for loc in locs:
loc = util.ensure_path(loc)
if loc.parts[-1].endswith('json'):
gold_tuples = read_json_file(loc)
elif loc.parts[-1].endswith('msg'):
with loc.open('rb') as file_:
gold_tuples = msgpack.load(file_, encoding='utf8')
else:
msg = "Cannot read from file: %s. Supported formats: .json, .msg"
raise ValueError(msg % loc)
for item in gold_tuples:
yield item
i += len(item[1])
if limit and i >= limit:
break
@property
def dev_tuples(self):
locs = (self.tmp_dir / 'dev').iterdir()
yield from self.read_tuples(locs, limit=self.limit)
@property
def train_tuples(self):
locs = (self.tmp_dir / 'train').iterdir()
yield from self.read_tuples(locs, limit=self.limit)
def count_train(self):
n = 0
i = 0
for raw_text, paragraph_tuples in self.train_tuples:
for sent_tuples, brackets in paragraph_tuples:
n += len(sent_tuples[1])
if self.limit and i >= self.limit:
break
i += len(paragraph_tuples)
return n
def train_docs(self, nlp, gold_preproc=False, max_length=None,
noise_level=0.0):
locs = list((self.tmp_dir / 'train').iterdir())
random.shuffle(locs)
train_tuples = self.read_tuples(locs, limit=self.limit)
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
max_length=max_length,
noise_level=noise_level,
make_projective=True)
yield from gold_docs
def dev_docs(self, nlp, gold_preproc=False):
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples,
gold_preproc=gold_preproc)
yield from gold_docs
@classmethod
def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
noise_level=0.0, make_projective=False):
for raw_text, paragraph_tuples in tuples:
if gold_preproc:
raw_text = None
else:
paragraph_tuples = merge_sents(paragraph_tuples)
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
gold_preproc, noise_level=noise_level)
golds = cls._make_golds(docs, paragraph_tuples, make_projective)
for doc, gold in zip(docs, golds):
if (not max_length) or len(doc) < max_length:
yield doc, gold
@classmethod
def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc,
noise_level=0.0):
if raw_text is not None:
raw_text = add_noise(raw_text, noise_level)
return [nlp.make_doc(raw_text)]
else:
return [Doc(nlp.vocab,
words=add_noise(sent_tuples[1], noise_level))
for (sent_tuples, brackets) in paragraph_tuples]
@classmethod
def _make_golds(cls, docs, paragraph_tuples, make_projective):
assert len(docs) == len(paragraph_tuples)
if len(docs) == 1:
return [GoldParse.from_annot_tuples(docs[0],
paragraph_tuples[0][0],
make_projective=make_projective)]
else:
return [GoldParse.from_annot_tuples(doc, sent_tuples,
make_projective=make_projective)
for doc, (sent_tuples, brackets)
in zip(docs, paragraph_tuples)]
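Taken together with the `spacy/cli/train.py` hunks earlier in this commit, the reworked class is consumed roughly as in the following sketch (paths are placeholders; the corpus caches the tuples as msgpack under a temp directory and streams them back each epoch):

from spacy.lang.en import English
from spacy.gold import GoldCorpus

nlp = English()
corpus = GoldCorpus('train.json', 'dev.json')   # placeholder paths
n_train_words = corpus.count_train()
for doc, gold in corpus.train_docs(nlp, gold_preproc=False, max_length=0):
    pass  # each (doc, gold) pair would be fed to nlp.update() during training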
def add_noise(orig, noise_level): def add_noise(orig, noise_level):
if random.random() >= noise_level: if random.random() >= noise_level:
@ -297,11 +267,7 @@ def read_json_file(loc, docs_filter=None, limit=None):
for filename in loc.iterdir(): for filename in loc.iterdir():
yield from read_json_file(loc / filename, limit=limit) yield from read_json_file(loc / filename, limit=limit)
else: else:
with loc.open('r', encoding='utf8') as file_: for doc in _json_iterate(loc):
docs = ujson.load(file_)
if limit is not None:
docs = docs[:limit]
for doc in docs:
if docs_filter is not None and not docs_filter(doc): if docs_filter is not None and not docs_filter(doc):
continue continue
paragraphs = [] paragraphs = []
@ -331,6 +297,56 @@ def read_json_file(loc, docs_filter=None, limit=None):
yield [paragraph.get('raw', None), sents] yield [paragraph.get('raw', None), sents]
def _json_iterate(loc):
# We should've made these files jsonl...But since we didn't, parse out
# the docs one-by-one to reduce memory usage.
# It's okay to read in the whole file -- just don't parse it into JSON.
cdef bytes py_raw
loc = util.ensure_path(loc)
with loc.open('rb') as file_:
py_raw = file_.read()
raw = <char*>py_raw
cdef int square_depth = 0
cdef int curly_depth = 0
cdef int inside_string = 0
cdef int escape = 0
cdef int start = -1
cdef char c
cdef char quote = ord('"')
cdef char backslash = ord('\\')
cdef char open_square = ord('[')
cdef char close_square = ord(']')
cdef char open_curly = ord('{')
cdef char close_curly = ord('}')
for i in range(len(py_raw)):
c = raw[i]
if c == backslash:
escape = True
continue
if escape:
escape = False
continue
if c == quote:
inside_string = not inside_string
continue
if inside_string:
continue
if c == open_square:
square_depth += 1
elif c == close_square:
square_depth -= 1
elif c == open_curly:
if square_depth == 1 and curly_depth == 0:
start = i
curly_depth += 1
elif c == close_curly:
curly_depth -= 1
if square_depth == 1 and curly_depth == 0:
py_str = py_raw[start : i+1].decode('utf8')
yield ujson.loads(py_str)
start = -1
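The Cython helper above avoids loading the whole training file as one JSON object: it scans the raw text, tracks bracket depth and string state, and yields one document dict at a time. A plain-Python sketch of the same idea (simplified, and assuming the file holds a single top-level JSON array of objects):

import io
import ujson  # the stdlib json module would work just as well here

def json_iterate_py(path):
    # Sketch of the bracket-depth scan in _json_iterate above; not the real helper.
    with io.open(path, encoding='utf8') as file_:
        text = file_.read()
    square_depth = curly_depth = 0
    inside_string = escape = False
    start = -1
    for i, char in enumerate(text):
        if escape:
            escape = False
        elif char == '\\':
            escape = True
        elif char == '"':
            inside_string = not inside_string
        elif inside_string:
            continue
        elif char == '[':
            square_depth += 1
        elif char == ']':
            square_depth -= 1
        elif char == '{':
            if square_depth == 1 and curly_depth == 0:
                start = i
            curly_depth += 1
        elif char == '}':
            curly_depth -= 1
            if square_depth == 1 and curly_depth == 0:
                yield ujson.loads(text[start:i + 1])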
def iob_to_biluo(tags): def iob_to_biluo(tags):
out = [] out = []
curr_label = None curr_label = None
@ -434,8 +450,21 @@ cdef class GoldParse:
self.labels = [None] * len(doc) self.labels = [None] * len(doc)
self.ner = [None] * len(doc) self.ner = [None] * len(doc)
self.cand_to_gold = align([t.orth_ for t in doc], words) # This needs to be done before we align the words
self.gold_to_cand = align(words, [t.orth_ for t in doc]) if make_projective and heads is not None and deps is not None:
heads, deps = nonproj.projectivize(heads, deps)
# Do many-to-one alignment for misaligned tokens.
# If we over-segment, we'll have one gold word that covers a sequence
# of predicted words
# If we under-segment, we'll have one predicted word that covers a
# sequence of gold words.
# If we "mis-segment", we'll have a sequence of predicted words covering
# a sequence of gold words. That's many-to-many -- we don't do that.
cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
annot_tuples = (range(len(words)), words, tags, heads, deps, entities) annot_tuples = (range(len(words)), words, tags, heads, deps, entities)
self.orig_annot = list(zip(*annot_tuples)) self.orig_annot = list(zip(*annot_tuples))
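The alignment call above returns integer arrays, with -1 marking unaligned positions, plus the `*_multi` dicts for many-to-one cases; the two list comprehensions then convert the one-to-one part into None-padded lists. A toy illustration with invented values, for the case where the tokenizer over-segments one gold token:

# Invented example: candidate tokens ["do", "n't"] vs. gold token ["don't"].
i2j = [-1, -1]            # neither candidate token has a one-to-one partner...
i2j_multi = {0: 0, 1: 0}  # ...but both map many-to-one onto gold token 0
cand_to_gold = [(j if j >= 0 else None) for j in i2j]
assert cand_to_gold == [None, None]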
@ -443,12 +472,47 @@ cdef class GoldParse:
for i, gold_i in enumerate(self.cand_to_gold): for i, gold_i in enumerate(self.cand_to_gold):
if doc[i].text.isspace(): if doc[i].text.isspace():
self.words[i] = doc[i].text self.words[i] = doc[i].text
self.tags[i] = 'SP' self.tags[i] = '_SP'
self.heads[i] = None self.heads[i] = None
self.labels[i] = None self.labels[i] = None
self.ner[i] = 'O' self.ner[i] = 'O'
if gold_i is None: if gold_i is None:
pass if i in i2j_multi:
self.words[i] = words[i2j_multi[i]]
self.tags[i] = tags[i2j_multi[i]]
is_last = i2j_multi[i] != i2j_multi.get(i+1)
is_first = i2j_multi[i] != i2j_multi.get(i-1)
# Set next word in multi-token span as head, until last
if not is_last:
self.heads[i] = i+1
self.labels[i] = 'subtok'
else:
self.heads[i] = self.gold_to_cand[heads[i2j_multi[i]]]
self.labels[i] = deps[i2j_multi[i]]
                    # Now set NER...This is annoying because if we've got
                    # an entity word split into two, we need to adjust the
# BILOU tags. We can't have BB or LL etc.
# Case 1: O -- easy.
ner_tag = entities[i2j_multi[i]]
if ner_tag == 'O':
self.ner[i] = 'O'
# Case 2: U. This has to become a B I* L sequence.
elif ner_tag.startswith('U-'):
if is_first:
self.ner[i] = ner_tag.replace('U-', 'B-', 1)
elif is_last:
self.ner[i] = ner_tag.replace('U-', 'L-', 1)
else:
self.ner[i] = ner_tag.replace('U-', 'I-', 1)
# Case 3: L. If not last, change to I.
elif ner_tag.startswith('L-'):
if is_last:
self.ner[i] = ner_tag
else:
self.ner[i] = ner_tag.replace('L-', 'I-', 1)
# Case 4: I. Stays correct
elif ner_tag.startswith('I-'):
self.ner[i] = ner_tag
else: else:
self.words[i] = words[gold_i] self.words[i] = words[gold_i]
self.tags[i] = tags[gold_i] self.tags[i] = tags[gold_i]
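The BILOU bookkeeping above is easy to lose track of in the middle of the alignment loop; the following standalone function is a distilled restatement of those cases, for illustration only (tags not covered by the cases above are returned unchanged here):

def retag_split_entity(ner_tag, is_first, is_last):
    # Mirrors cases 1-4 above for a token that is part of a split-up gold word.
    if ner_tag == 'O' or ner_tag.startswith('I-'):
        return ner_tag
    if ner_tag.startswith('U-'):
        if is_first:
            return ner_tag.replace('U-', 'B-', 1)
        return ner_tag.replace('U-', 'L-', 1) if is_last else ner_tag.replace('U-', 'I-', 1)
    if ner_tag.startswith('L-') and not is_last:
        return ner_tag.replace('L-', 'I-', 1)
    return ner_tag

assert retag_split_entity('U-PERSON', is_first=True, is_last=False) == 'B-PERSON'
assert retag_split_entity('U-PERSON', is_first=False, is_last=True) == 'L-PERSON'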
@ -463,10 +527,6 @@ cdef class GoldParse:
if cycle is not None: if cycle is not None:
raise Exception("Cycle found: %s" % cycle) raise Exception("Cycle found: %s" % cycle)
if make_projective:
proj_heads, _ = nonproj.projectivize(self.heads, self.labels)
self.heads = proj_heads
def __len__(self): def __len__(self):
"""Get the number of gold-standard tokens. """Get the number of gold-standard tokens.


@ -39,7 +39,7 @@ made make many may me meanwhile might mine more moreover most mostly move much
must my myself must my myself
name namely neither never nevertheless next nine no nobody none noone nor not name namely neither never nevertheless next nine no nobody none noone nor not
nothing now nowhere nothing now nowhere n't
of off often on once one only onto or other others otherwise our ours ourselves of off often on once one only onto or other others otherwise our ours ourselves
out over own out over own
@ -66,4 +66,6 @@ whereafter whereas whereby wherein whereupon wherever whether which while
whither who whoever whole whom whose why will with within without would whither who whoever whole whom whose why will with within without would
yet you your yours yourself yourselves yet you your yours yourself yourselves
'd 'll 'm 're 's 've
""".split()) """.split())


@ -6,17 +6,19 @@ from ...symbols import NOUN, PROPN, PRON, VERB, AUX
def noun_chunks(obj): def noun_chunks(obj):
doc = obj.doc doc = obj.doc
np_label = doc.vocab.strings['NP'] if not len(doc):
return
np_label = doc.vocab.strings.add('NP')
left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed'] left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed']
right_labels = ['flat', 'fixed', 'compound', 'neg'] right_labels = ['flat', 'fixed', 'compound', 'neg']
stop_labels = ['punct'] stop_labels = ['punct']
np_left_deps = [doc.vocab.strings[label] for label in left_labels] np_left_deps = [doc.vocab.strings.add(label) for label in left_labels]
np_right_deps = [doc.vocab.strings[label] for label in right_labels] np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
stop_deps = [doc.vocab.strings[label] for label in stop_labels] stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
token = doc[0] token = doc[0]
while token and token.i < len(doc): while token and token.i < len(doc):
if token.pos in [PROPN, NOUN, PRON]: if token.pos in [PROPN, NOUN, PRON]:
left, right = noun_bounds(token) left, right = noun_bounds(doc, token, np_left_deps, np_right_deps, stop_deps)
yield left.i, right.i+1, np_label yield left.i, right.i+1, np_label
token = right token = right
token = next_token(token) token = next_token(token)
@ -33,7 +35,7 @@ def next_token(token):
return None return None
def noun_bounds(root): def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps):
left_bound = root left_bound = root
for token in reversed(list(root.lefts)): for token in reversed(list(root.lefts)):
if token.dep in np_left_deps: if token.dep in np_left_deps:
@ -41,7 +43,7 @@ def noun_bounds(root):
right_bound = root right_bound = root
for token in root.rights: for token in root.rights:
if (token.dep in np_right_deps): if (token.dep in np_right_deps):
left, right = noun_bounds(token) left, right = noun_bounds(doc, token, np_left_deps, np_right_deps, stop_deps)
if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps, if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps,
doc[left_bound.i: right.i])): doc[left_bound.i: right.i])):
break break

15
spacy/lang/fi/examples.py Normal file

@ -0,0 +1,15 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.fi.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple harkitsee ostavansa startup-yrityksen UK:sta 1 miljardilla dollarilla.",
"Itseajavat autot siirtävät vakuutusriskin valmistajille.",
"San Francisco harkitsee jakelurobottien kieltämistä jalkakäytävillä.",
"Lontoo on iso kaupunki Iso-Britanniassa."
]


@ -0,0 +1,26 @@
# coding: utf8
from __future__ import unicode_literals
# import the symbols for the attrs you want to overwrite
from ...attrs import LIKE_NUM
# check if token resembles a number
_num_words = ['nolla', 'yksi', 'kaksi', 'kolme', 'neljä', 'viisi', 'kuusi', 'seitsemän', 'kahdeksan', 'yhdeksän', 'kymmenen', 'yksitoista', 'kaksitoista', 'kolmetoista', 'neljätoista', 'viisitoista', 'kuusitoista', 'seitsemäntoista', 'kahdeksantoista', 'yhdeksäntoista', 'kaksikymmentä', 'kolmekymmentä', 'neljäkymmentä', 'viisikymmentä', 'kuusikymmentä', 'seitsemänkymmentä', 'kahdeksankymmentä', 'yhdeksänkymmentä', 'sata', 'tuhat', 'miljoona', 'miljardi', 'triljoona']
def like_num(text):
text = text.replace('.', '').replace(',', '')
if text.isdigit():
return True
if text.count('/') == 1:
num, denom = text.split('/')
if num.isdigit() and denom.isdigit():
return True
if text in _num_words:
return True
return False
LEX_ATTRS = {
LIKE_NUM: like_num
}
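A quick sanity check of the helper above (illustrative; in spaCy the function is only reached through the LEX_ATTRS hook):

assert like_num('7')
assert like_num('3/4')
assert like_num('viisi')          # "five"
assert not like_num('Helsinki')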


@ -79,7 +79,7 @@ pienestä pieni pienin poikki puolesta puolestaan päälle
runsaasti runsaasti
saakka sama samaa samaan samalla saman samat samoin sata sataa satojen se saakka sama samaa samaan samalla saman samat samoin satojen se
seitsemän sekä sen seuraavat siellä sieltä siihen siinä siis siitä sijaan siksi seitsemän sekä sen seuraavat siellä sieltä siihen siinä siis siitä sijaan siksi
sille silloin sillä silti siltä sinne sinua sinulla sinulle sinulta sinun sille silloin sillä silti siltä sinne sinua sinulla sinulle sinulta sinun
sinussa sinusta sinut sinuun sinä sisäkkäin sisällä siten sitten sitä ssa sta sinussa sinusta sinut sinuun sinä sisäkkäin sisällä siten sitten sitä ssa sta
@ -89,7 +89,7 @@ taa taas taemmas tahansa tai takaa takaisin takana takia tallä tapauksessa
tarpeeksi tavalla tavoitteena te teidän teidät teihin teille teillä teiltä tarpeeksi tavalla tavoitteena te teidän teidät teihin teille teillä teiltä
teissä teistä teitä tietysti todella toinen toisaalla toisaalle toisaalta teissä teistä teitä tietysti todella toinen toisaalla toisaalle toisaalta
toiseen toiseksi toisella toiselle toiselta toisemme toisen toisensa toisessa toiseen toiseksi toisella toiselle toiselta toisemme toisen toisensa toisessa
toisesta toista toistaiseksi toki tosin tuhannen tuhat tule tulee tulemme tulen toisesta toista toistaiseksi toki tosin tule tulee tulemme tulen
tulet tulette tulevat tulimme tulin tulisi tulisimme tulisin tulisit tulisitte tulet tulette tulevat tulimme tulin tulisi tulisimme tulisin tulisit tulisitte
tulisivat tulit tulitte tulivat tulla tulleet tullut tuntuu tuo tuohon tuoksi tulisivat tulit tulitte tulivat tulla tulleet tullut tuntuu tuo tuohon tuoksi
tuolla tuolle tuolloin tuolta tuon tuona tuonne tuossa tuosta tuota tuskin tykö tuolla tuolle tuolloin tuolta tuon tuona tuonne tuossa tuosta tuota tuskin tykö


@ -35,14 +35,32 @@ class JapaneseTokenizer(object):
def from_disk(self, path, **exclude): def from_disk(self, path, **exclude):
return self return self
class JapaneseCharacterSegmenter(object):
def __init__(self, vocab):
self.vocab = vocab
def __call__(self, text):
words = []
spaces = []
doc = self.tokenizer(text)
for token in self.tokenizer(text):
words.extend(list(token.text))
spaces.extend([False]*len(token.text))
spaces[-1] = bool(token.whitespace_)
return Doc(self.vocab, words=words, spaces=spaces)
class JapaneseDefaults(Language.Defaults): class JapaneseDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'ja' lex_attr_getters[LANG] = lambda text: 'ja'
use_janome = True
@classmethod @classmethod
def create_tokenizer(cls, nlp=None): def create_tokenizer(cls, nlp=None):
if cls.use_janome:
return JapaneseTokenizer(cls, nlp) return JapaneseTokenizer(cls, nlp)
else:
return JapaneseCharacterSegmenter(cls, nlp.vocab)
class Japanese(Language): class Japanese(Language):


@ -144,7 +144,7 @@ def is_lower(string): return string.islower()
def is_space(string): return string.isspace() def is_space(string): return string.isspace()
def is_title(string): return string.istitle() def is_title(string): return string.istitle()
def is_upper(string): return string.isupper() def is_upper(string): return string.isupper()
def is_stop(string, stops=set()): return string in stops def is_stop(string, stops=set()): return string.lower() in stops
def is_oov(string): return True def is_oov(string): return True
def get_prob(string): return -20. def get_prob(string): return -20.


@ -2,6 +2,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
@ -17,6 +18,7 @@ class PolishDefaults(Language.Defaults):
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS stop_words = STOP_WORDS
tag_map = TAG_MAP
class Polish(Language): class Polish(Language):

1628
spacy/lang/pl/tag_map.py Normal file

File diff suppressed because it is too large


@ -1,7 +1,7 @@
# encoding: utf8 # encoding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN, ADP from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN
_exc = {} _exc = {}
@ -12,24 +12,11 @@ for exc_data in [
{ORTH: "mgr.", LEMMA: "magister", POS: NOUN}, {ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
{ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV}, {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
{ORTH: "tj.", LEMMA: "to jest", POS: ADV}, {ORTH: "tj.", LEMMA: "to jest", POS: ADV},
{ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}, {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
{ORTH: "adw.", LEMMA: "adwokat", POS: NOUN},
{ORTH: "afr.", LEMMA: "afrykański", POS: ADJ},
{ORTH: "c.b.d.o.", LEMMA: "co było do okazania", POS: ADV},
{ORTH: "cbdu.", LEMMA: "co było do udowodnienia", POS: ADV},
{ORTH: "mn.w.", LEMMA: "mniej więcej", POS: ADV},
{ORTH: "nt.", LEMMA: "na temat", POS: ADP},
{ORTH: "ok.", LEMMA: "około"},
{ORTH: "n.p.u.", LEMMA: "na psa urok"},
{ORTH: "ww.", LEMMA: "wyżej wymieniony", POS: ADV}]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
for orth in [ for orth in [
"w.", "r.", "br.", "bm.", "b.r.", "amer.", "am.", "bdb.", "św.", "p.", "lit.", "w.", "r."]:
"wym.", "czyt.", "daw.", "d.", "zob.", "gw.", "dn.", "dyr.", "im.", "mł.",
"min.", "dot.", "muz.", "k.k.", "k.p.a.", "k.p.c.", "n.p.m.", "p.p.m.", "nb.",
"ob.", "n.e.", "p.n.e.", "zw.", "zool.", "zach.", "żarg.", "żart.", "wzgl.",
"wyj.", "xx.", "ks.", "x.", "wyd.", "wsch.", "o.o."]:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]


@ -24,5 +24,5 @@ TAG_MAP = {
"ADJ": {POS: ADJ}, "ADJ": {POS: ADJ},
"VERB": {POS: VERB}, "VERB": {POS: VERB},
"PART": {POS: PART}, "PART": {POS: PART},
"SP": {POS: SPACE} "_SP": {POS: SPACE}
} }

19
spacy/lang/vi/__init__.py Normal file

@ -0,0 +1,19 @@
# coding: utf8
from __future__ import unicode_literals
from ...attrs import LANG
from ...language import Language
from ...tokens import Doc
class VietnameseDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'vi' # for pickling
class Vietnamese(Language):
lang = 'vi'
Defaults = VietnameseDefaults # override defaults
__all__ = ['Vietnamese']


@ -9,6 +9,7 @@ from ...tokens import Doc
class ChineseDefaults(Language.Defaults): class ChineseDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'zh' # for pickling lex_attr_getters[LANG] = lambda text: 'zh' # for pickling
use_jieba = True
class Chinese(Language): class Chinese(Language):
@ -16,14 +17,25 @@ class Chinese(Language):
Defaults = ChineseDefaults # override defaults Defaults = ChineseDefaults # override defaults
def make_doc(self, text): def make_doc(self, text):
if self.Defaults.use_jieba:
try: try:
import jieba import jieba
except ImportError: except ImportError:
raise ImportError("The Chinese tokenizer requires the Jieba library: " msg = ("Jieba not installed. Either set Chinese.use_jieba = False, "
"https://github.com/fxsjy/jieba") "or install it https://github.com/fxsjy/jieba")
raise ImportError(msg)
words = list(jieba.cut(text, cut_all=False)) words = list(jieba.cut(text, cut_all=False))
words = [x for x in words if x] words = [x for x in words if x]
return Doc(self.vocab, words=words, spaces=[False]*len(words)) return Doc(self.vocab, words=words, spaces=[False]*len(words))
else:
words = []
spaces = []
doc = self.tokenizer(text)
for token in self.tokenizer(text):
words.extend(list(token.text))
spaces.extend([False]*len(token.text))
spaces[-1] = bool(token.whitespace_)
return Doc(self.vocab, words=words, spaces=spaces)
__all__ = ['Chinese'] __all__ = ['Chinese']


@ -17,7 +17,7 @@ from .vocab import Vocab
from .lemmatizer import Lemmatizer from .lemmatizer import Lemmatizer
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
from .pipeline import merge_noun_chunks, merge_entities from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
from .compat import json_dumps, izip, basestring_ from .compat import json_dumps, izip, basestring_
from .gold import GoldParse from .gold import GoldParse
from .scorer import Scorer from .scorer import Scorer
@ -108,7 +108,8 @@ class Language(object):
'sbd': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg), 'sbd': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
'sentencizer': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg), 'sentencizer': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
'merge_noun_chunks': lambda nlp, **cfg: merge_noun_chunks, 'merge_noun_chunks': lambda nlp, **cfg: merge_noun_chunks,
'merge_entities': lambda nlp, **cfg: merge_entities 'merge_entities': lambda nlp, **cfg: merge_entities,
'merge_subtokens': lambda nlp, **cfg: merge_subtokens,
} }
def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs): def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs):


@ -1,7 +1,7 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from .symbols import POS, NOUN, VERB, ADJ, PUNCT from .symbols import POS, NOUN, VERB, ADJ, PUNCT, PROPN
from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
@ -27,11 +27,13 @@ class Lemmatizer(object):
univ_pos = 'adj' univ_pos = 'adj'
elif univ_pos in (PUNCT, 'PUNCT', 'punct'): elif univ_pos in (PUNCT, 'PUNCT', 'punct'):
univ_pos = 'punct' univ_pos = 'punct'
elif univ_pos in (PROPN, 'PROPN'):
return [string]
else: else:
return list(set([string.lower()])) return [string.lower()]
        # See Issue #435 for example of where this logic is required. # See Issue #435 for example of where this logic is required.
if self.is_base_form(univ_pos, morphology): if self.is_base_form(univ_pos, morphology):
return list(set([string.lower()])) return [string.lower()]
lemmas = lemmatize(string, self.index.get(univ_pos, {}), lemmas = lemmatize(string, self.index.get(univ_pos, {}),
self.exc.get(univ_pos, {}), self.exc.get(univ_pos, {}),
self.rules.get(univ_pos, [])) self.rules.get(univ_pos, []))
@ -88,6 +90,7 @@ class Lemmatizer(object):
def lemmatize(string, index, exceptions, rules): def lemmatize(string, index, exceptions, rules):
orig = string
string = string.lower() string = string.lower()
forms = [] forms = []
forms.extend(exceptions.get(string, [])) forms.extend(exceptions.get(string, []))
@ -105,5 +108,5 @@ def lemmatize(string, index, exceptions, rules):
if not forms: if not forms:
forms.extend(oov_forms) forms.extend(oov_forms)
if not forms: if not forms:
forms.append(string) forms.append(orig)
return list(set(forms)) return list(set(forms))


@ -1,24 +1,19 @@
# cython: profile=True
# cython: infer_types=True # cython: infer_types=True
# coding: utf8 # cython: profile=True
from __future__ import unicode_literals from __future__ import unicode_literals
import ujson
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector from libcpp.vector cimport vector
from libcpp.pair cimport pair from libc.stdint cimport int32_t, uint64_t, uint16_t
from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64
from libc.stdint cimport int32_t from .typedefs cimport attr_t, hash_t
from .typedefs cimport attr_t
from .typedefs cimport hash_t
from .structs cimport TokenC from .structs cimport TokenC
from .tokens.doc cimport Doc, get_token_attr from .lexeme cimport attr_id_t
from .vocab cimport Vocab from .vocab cimport Vocab
from .tokens.doc cimport Doc
from .tokens.doc cimport get_token_attr
from .attrs cimport ID, attr_id_t, NULL_ATTR
from .attrs import IDS from .attrs import IDS
from .attrs cimport attr_id_t, ID, NULL_ATTR
from .attrs import FLAG61 as U_ENT from .attrs import FLAG61 as U_ENT
from .attrs import FLAG60 as B2_ENT from .attrs import FLAG60 as B2_ENT
from .attrs import FLAG59 as B3_ENT from .attrs import FLAG59 as B3_ENT
@ -48,29 +43,24 @@ from .attrs import FLAG36 as L9_ENT
from .attrs import FLAG35 as L10_ENT from .attrs import FLAG35 as L10_ENT
cpdef enum quantifier_t: cdef enum action_t:
_META REJECT = 0000
ONE MATCH = 1000
ADVANCE = 0100
RETRY = 0010
RETRY_EXTEND = 0011
MATCH_EXTEND = 1001
MATCH_REJECT = 2000
cdef enum quantifier_t:
ZERO ZERO
ZERO_ONE ZERO_ONE
ZERO_PLUS ZERO_PLUS
ONE
ONE_PLUS
cdef enum action_t:
REJECT
ADVANCE
REPEAT
ACCEPT
ADVANCE_ZERO
ACCEPT_PREV
PANIC
# A "match expression" conists of one or more token patterns
# Each token pattern consists of a quantifier and 0+ (attr, value) pairs.
# A state is an (int, pattern pointer) pair, where the int is the start
# position, and the pattern pointer shows where we're up to
# in the pattern.
cdef struct AttrValueC: cdef struct AttrValueC:
attr_id_t attr attr_id_t attr
attr_t value attr_t value
@ -80,10 +70,231 @@ cdef struct TokenPatternC:
AttrValueC* attrs AttrValueC* attrs
int32_t nr_attr int32_t nr_attr
quantifier_t quantifier quantifier_t quantifier
hash_t key
ctypedef TokenPatternC* TokenPatternC_ptr cdef struct ActionC:
ctypedef pair[int, TokenPatternC_ptr] StateC char emit_match
char next_state_next_token
char next_state_same_token
char same_state_next_token
cdef struct PatternStateC:
TokenPatternC* pattern
int32_t start
int32_t length
cdef struct MatchC:
attr_t pattern_id
int32_t start
int32_t length
cdef find_matches(TokenPatternC** patterns, int n, Doc doc):
cdef vector[PatternStateC] states
cdef vector[MatchC] matches
cdef PatternStateC state
cdef Pool mem = Pool()
# TODO: Prefill this with the extra attribute values.
extra_attrs = <attr_t**>mem.alloc(len(doc), sizeof(attr_t*))
# Main loop
cdef int i, j
for i in range(doc.length):
for j in range(n):
states.push_back(PatternStateC(patterns[j], i, 0))
transition_states(states, matches, &doc.c[i], extra_attrs[i])
# Handle matches that end in 0-width patterns
finish_states(matches, states)
return [(matches[i].pattern_id, matches[i].start, matches[i].start+matches[i].length)
for i in range(matches.size())]
cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
const TokenC* token, const attr_t* extra_attrs) except *:
cdef int q = 0
cdef vector[PatternStateC] new_states
for i in range(states.size()):
action = get_action(states[i], token, extra_attrs)
if action == REJECT:
continue
state = states[i]
states[q] = state
while action in (RETRY, RETRY_EXTEND):
if action == RETRY_EXTEND:
new_states.push_back(
PatternStateC(pattern=state.pattern, start=state.start,
length=state.length+1))
states[q].pattern += 1
action = get_action(states[q], token, extra_attrs)
if action == REJECT:
pass
elif action == ADVANCE:
states[q].pattern += 1
states[q].length += 1
q += 1
else:
ent_id = state.pattern[1].attrs.value
if action == MATCH:
matches.push_back(
MatchC(pattern_id=ent_id, start=state.start,
length=state.length+1))
elif action == MATCH_REJECT:
matches.push_back(
MatchC(pattern_id=ent_id, start=state.start,
length=state.length))
elif action == MATCH_EXTEND:
matches.push_back(
MatchC(pattern_id=ent_id, start=state.start,
length=state.length))
states[q].length += 1
q += 1
states.resize(q)
for i in range(new_states.size()):
states.push_back(new_states[i])
cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states) except *:
'''Handle states that end in zero-width patterns.'''
cdef PatternStateC state
for i in range(states.size()):
state = states[i]
while get_quantifier(state) in (ZERO_PLUS, ZERO_ONE):
is_final = get_is_final(state)
if is_final:
ent_id = state.pattern[1].attrs.value
matches.push_back(
MatchC(pattern_id=ent_id, start=state.start, length=state.length))
break
else:
state.pattern += 1
cdef action_t get_action(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil:
'''We need to consider:
a) Does the token match the specification? [Yes, No]
b) What's the quantifier? [1, 0+, ?]
c) Is this the last specification? [final, non-final]
We can transition in the following ways:
a) Do we emit a match?
b) Do we add a state with (next state, next token)?
c) Do we add a state with (next state, same token)?
d) Do we add a state with (same state, next token)?
We'll code the actions as boolean strings, so 0000 means no to all 4,
1000 means match but no states added, etc.
1:
Yes, final:
1000
Yes, non-final:
0100
No, final:
0000
No, non-final
0000
0+:
Yes, final:
1001
Yes, non-final:
0011
No, final:
        2000 (note: Don't include last token!)
No, non-final:
0010
?:
Yes, final:
1000
Yes, non-final:
0100
No, final:
        2000 (note: Don't include last token!)
No, non-final:
0010
    Possible combinations: 1000, 0100, 0000, 1001, 0011, 0010, 2000
We'll name the bits "match", "advance", "retry", "extend"
REJECT = 0000
MATCH = 1000
ADVANCE = 0100
RETRY = 0010
MATCH_EXTEND = 1001
RETRY_EXTEND = 0011
MATCH_REJECT = 2000 # Match, but don't include last token
Problem: If a quantifier is matching, we're adding a lot of open partials
'''
cdef char is_match
is_match = get_is_match(state, token, extra_attrs)
quantifier = get_quantifier(state)
is_final = get_is_final(state)
if quantifier == ZERO:
is_match = not is_match
quantifier = ONE
if quantifier == ONE:
if is_match and is_final:
# Yes, final: 1000
return MATCH
elif is_match and not is_final:
# Yes, non-final: 0100
return ADVANCE
elif not is_match and is_final:
# No, final: 0000
return REJECT
else:
return REJECT
elif quantifier == ZERO_PLUS:
if is_match and is_final:
# Yes, final: 1001
return MATCH_EXTEND
elif is_match and not is_final:
# Yes, non-final: 0011
return RETRY_EXTEND
elif not is_match and is_final:
# No, final 2000 (note: Don't include last token!)
return MATCH_REJECT
else:
# No, non-final 0010
return RETRY
elif quantifier == ZERO_ONE:
if is_match and is_final:
# Yes, final: 1000
return MATCH
elif is_match and not is_final:
# Yes, non-final: 0100
return ADVANCE
elif not is_match and is_final:
# No, final 2000 (note: Don't include last token!)
return MATCH_REJECT
else:
# No, non-final 0010
return RETRY
cdef char get_is_match(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil:
spec = state.pattern
for attr in spec.attrs[:spec.nr_attr]:
if get_token_attr(token, attr.attr) != attr.value:
return 0
else:
return 1
cdef char get_is_final(PatternStateC state) nogil:
if state.pattern[1].attrs[0].attr == ID and state.pattern[1].nr_attr == 0:
return 1
else:
return 0
cdef char get_quantifier(PatternStateC state) nogil:
return state.pattern.quantifier
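For reference, the decision table spelled out in the get_action docstring can be restated as a small pure-Python function (a paraphrase of the Cython code above, using the pattern operator strings in place of the quantifier enum):

def action_for(quantifier, is_match, is_final):
    # quantifier uses the operator strings: '1' (ONE), '*' (ZERO_PLUS),
    # '?' (ZERO_ONE), '!' (ZERO)
    if quantifier == '!':
        is_match, quantifier = not is_match, '1'
    if quantifier == '1':
        if not is_match:
            return 'REJECT'
        return 'MATCH' if is_final else 'ADVANCE'
    if quantifier == '*':
        if is_match:
            return 'MATCH_EXTEND' if is_final else 'RETRY_EXTEND'
        return 'MATCH_REJECT' if is_final else 'RETRY'
    if quantifier == '?':
        if is_match:
            return 'MATCH' if is_final else 'ADVANCE'
        return 'MATCH_REJECT' if is_final else 'RETRY'
    raise ValueError(quantifier)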
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
@ -97,6 +308,7 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
for j, (attr, value) in enumerate(spec): for j, (attr, value) in enumerate(spec):
pattern[i].attrs[j].attr = attr pattern[i].attrs[j].attr = attr
pattern[i].attrs[j].value = value pattern[i].attrs[j].value = value
pattern[i].key = hash64(pattern[i].attrs, pattern[i].nr_attr * sizeof(AttrValueC), 0)
i = len(token_specs) i = len(token_specs)
pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC)) pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC))
pattern[i].attrs[0].attr = ID pattern[i].attrs[0].attr = ID
@ -105,48 +317,16 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
return pattern return pattern
cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0: cdef attr_t get_pattern_key(const TokenPatternC* pattern) nogil:
while pattern.nr_attr != 0: while pattern.nr_attr != 0:
pattern += 1 pattern += 1
id_attr = pattern[0].attrs[0] id_attr = pattern[0].attrs[0]
assert id_attr.attr == ID
return id_attr.value return id_attr.value
cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
lookahead = &pattern[1]
for attr in pattern.attrs[:pattern.nr_attr]:
if get_token_attr(token, attr.attr) != attr.value:
if pattern.quantifier == ONE:
return REJECT
elif pattern.quantifier == ZERO:
return ACCEPT if lookahead.nr_attr == 0 else ADVANCE
elif pattern.quantifier in (ZERO_ONE, ZERO_PLUS):
return ACCEPT_PREV if lookahead.nr_attr == 0 else ADVANCE_ZERO
else:
return PANIC
if pattern.quantifier == ZERO:
return REJECT
elif lookahead.nr_attr == 0:
return ACCEPT
elif pattern.quantifier in (ONE, ZERO_ONE):
return ADVANCE
elif pattern.quantifier == ZERO_PLUS:
# This is a bandaid over the 'shadowing' problem described here:
# https://github.com/explosion/spaCy/issues/864
next_action = get_action(lookahead, token)
if next_action is REJECT:
return REPEAT
else:
return ADVANCE_ZERO
else:
return PANIC
def _convert_strings(token_specs, string_store): def _convert_strings(token_specs, string_store):
# Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS # Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
operators = {'!': (ZERO,), '*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS), operators = {'*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
'?': (ZERO_ONE,), '1': (ONE,)} '?': (ZERO_ONE,), '1': (ONE,), '!': (ZERO,)}
tokens = [] tokens = []
op = ONE op = ONE
for spec in token_specs: for spec in token_specs:
@ -176,21 +356,6 @@ def _convert_strings(token_specs, string_store):
return tokens return tokens
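The operator sugar handled here is what end users see as the OP key in token patterns. A small usage sketch against the public 2.x API (pattern and text invented):

from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()
matcher = Matcher(nlp.vocab)
# '?' marks the punctuation token as optional, i.e. ZERO_ONE above
matcher.add('HELLO_WORLD', None,
            [{'LOWER': 'hello'}, {'IS_PUNCT': True, 'OP': '?'}, {'LOWER': 'world'}])
doc = nlp(u'Hello, world! Hello world!')
matches = matcher(doc)  # both "Hello, world" and "Hello world" should match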
def merge_phrase(matcher, doc, i, matches):
"""Callback to merge a phrase on match."""
ent_id, label, start, end = matches[i]
span = doc[start:end]
span.merge(ent_type=label, ent_id=ent_id)
def unpickle_matcher(vocab, patterns, callbacks):
matcher = Matcher(vocab)
for key, specs in patterns.items():
callback = callbacks.get(key, None)
matcher.add(key, callback, *specs)
return matcher
cdef class Matcher: cdef class Matcher:
"""Match sequences of tokens, based on pattern rules.""" """Match sequences of tokens, based on pattern rules."""
cdef Pool mem cdef Pool mem
@ -333,85 +498,9 @@ cdef class Matcher:
describing the matches. A match tuple describes a span describing the matches. A match tuple describes a span
`doc[start:end]`. The `label_id` and `key` are both integers. `doc[start:end]`. The `label_id` and `key` are both integers.
""" """
cdef vector[StateC] partials matches = find_matches(&self.patterns[0], self.patterns.size(), doc)
cdef int n_partials = 0 for i, (key, start, end) in enumerate(matches):
cdef int q = 0 on_match = self._callbacks.get(key, None)
cdef int i, token_i
cdef const TokenC* token
cdef StateC state
matches = []
for token_i in range(doc.length):
token = &doc.c[token_i]
q = 0
# Go over the open matches, extending or finalizing if able.
# Otherwise, we over-write them (q doesn't advance)
for state in partials:
action = get_action(state.second, token)
if action == PANIC:
raise Exception("Error selecting action in matcher")
while action == ADVANCE_ZERO:
state.second += 1
action = get_action(state.second, token)
if action == PANIC:
raise Exception("Error selecting action in matcher")
if action == REPEAT:
# Leave the state in the queue, and advance to next slot
# (i.e. we don't overwrite -- we want to greedily match
# more pattern.
q += 1
elif action == REJECT:
pass
elif action == ADVANCE:
partials[q] = state
partials[q].second += 1
q += 1
elif action in (ACCEPT, ACCEPT_PREV):
# TODO: What to do about patterns starting with ZERO? Need
# to adjust the start position.
start = state.first
end = token_i+1 if action == ACCEPT else token_i
ent_id = state.second[1].attrs[0].value
label = state.second[1].attrs[1].value
matches.append((ent_id, start, end))
partials.resize(q)
# Check whether we open any new patterns on this token
for pattern in self.patterns:
action = get_action(pattern, token)
if action == PANIC:
raise Exception("Error selecting action in matcher")
while action == ADVANCE_ZERO:
pattern += 1
action = get_action(pattern, token)
if action == REPEAT:
state.first = token_i
state.second = pattern
partials.push_back(state)
elif action == ADVANCE:
# TODO: What to do about patterns starting with ZERO? Need
# to adjust the start position.
state.first = token_i
state.second = pattern + 1
partials.push_back(state)
elif action in (ACCEPT, ACCEPT_PREV):
start = token_i
end = token_i+1 if action == ACCEPT else token_i
ent_id = pattern[1].attrs[0].value
label = pattern[1].attrs[1].value
matches.append((ent_id, start, end))
# Look for open patterns that are actually satisfied
for state in partials:
while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
state.second += 1
if state.second.nr_attr == 0:
start = state.first
end = len(doc)
ent_id = state.second.attrs[0].value
label = state.second.attrs[0].value
matches.append((ent_id, start, end))
for i, (ent_id, start, end) in enumerate(matches):
on_match = self._callbacks.get(ent_id)
if on_match is not None: if on_match is not None:
on_match(self, doc, i, matches) on_match(self, doc, i, matches)
return matches return matches
@ -423,31 +512,37 @@ cdef class Matcher:
return key return key
def unpickle_matcher(vocab, patterns, callbacks):
matcher = Matcher(vocab)
for key, specs in patterns.items():
callback = callbacks.get(key, None)
matcher.add(key, callback, *specs)
return matcher
def _get_longest_matches(matches):
'''Filter out matches that have a longer equivalent.'''
longest_matches = {}
for pattern_id, start, end in matches:
key = (pattern_id, start)
length = end-start
if key not in longest_matches or length > longest_matches[key]:
longest_matches[key] = length
return [(pattern_id, start, start+length)
for (pattern_id, start), length in longest_matches.items()]
def get_bilou(length): def get_bilou(length):
if length == 1: if length == 0:
raise ValueError("Length must be >= 1")
elif length == 1:
return [U_ENT] return [U_ENT]
elif length == 2: elif length == 2:
return [B2_ENT, L2_ENT] return [B2_ENT, L2_ENT]
elif length == 3: elif length == 3:
return [B3_ENT, I3_ENT, L3_ENT] return [B3_ENT, I3_ENT, L3_ENT]
elif length == 4:
return [B4_ENT, I4_ENT, I4_ENT, L4_ENT]
elif length == 5:
return [B5_ENT, I5_ENT, I5_ENT, I5_ENT, L5_ENT]
elif length == 6:
return [B6_ENT, I6_ENT, I6_ENT, I6_ENT, I6_ENT, L6_ENT]
elif length == 7:
return [B7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, L7_ENT]
elif length == 8:
return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT]
elif length == 9:
return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT,
L9_ENT]
elif length == 10:
return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT,
I10_ENT, I10_ENT, L10_ENT]
else: else:
raise ValueError("Max length currently 10 for phrase matching") return [B4_ENT, I4_ENT] + [I4_ENT] * (length-3) + [L4_ENT]
cdef class PhraseMatcher: cdef class PhraseMatcher:
@ -456,21 +551,21 @@ cdef class PhraseMatcher:
cdef Matcher matcher cdef Matcher matcher
cdef PreshMap phrase_ids cdef PreshMap phrase_ids
cdef int max_length cdef int max_length
cdef attr_t* _phrase_key
cdef public object _callbacks cdef public object _callbacks
cdef public object _patterns cdef public object _patterns
def __init__(self, Vocab vocab, max_length=10): def __init__(self, Vocab vocab, max_length=10):
self.mem = Pool() self.mem = Pool()
self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t))
self.max_length = max_length self.max_length = max_length
self.vocab = vocab self.vocab = vocab
self.matcher = Matcher(self.vocab) self.matcher = Matcher(self.vocab)
self.phrase_ids = PreshMap() self.phrase_ids = PreshMap()
abstract_patterns = [] abstract_patterns = [
for length in range(1, max_length): [{U_ENT: True}],
abstract_patterns.append([{tag: True} [{B2_ENT: True}, {L2_ENT: True}],
for tag in get_bilou(length)]) [{B3_ENT: True}, {I3_ENT: True}, {L3_ENT: True}],
[{B4_ENT: True}, {I4_ENT: True}, {I4_ENT: True, "OP": "+"}, {L4_ENT: True}],
]
self.matcher.add('Candidate', None, *abstract_patterns) self.matcher.add('Candidate', None, *abstract_patterns)
self._callbacks = {} self._callbacks = {}
@ -504,29 +599,24 @@ cdef class PhraseMatcher:
*docs (Doc): `Doc` objects representing match patterns. *docs (Doc): `Doc` objects representing match patterns.
""" """
cdef Doc doc cdef Doc doc
for doc in docs:
if len(doc) >= self.max_length:
msg = (
"Pattern length (%d) >= phrase_matcher.max_length (%d). "
"Length can be set on initialization, up to 10."
)
raise ValueError(msg % (len(doc), self.max_length))
cdef hash_t ent_id = self.matcher._normalize_key(key) cdef hash_t ent_id = self.matcher._normalize_key(key)
self._callbacks[ent_id] = on_match self._callbacks[ent_id] = on_match
cdef int length cdef int length
cdef int i cdef int i
cdef hash_t phrase_hash cdef hash_t phrase_hash
cdef Pool mem = Pool()
for doc in docs: for doc in docs:
length = doc.length length = doc.length
if length == 0:
continue
tags = get_bilou(length) tags = get_bilou(length)
for i in range(self.max_length): phrase_key = <attr_t*>mem.alloc(length, sizeof(attr_t))
self._phrase_key[i] = 0
for i, tag in enumerate(tags): for i, tag in enumerate(tags):
lexeme = self.vocab[doc.c[i].lex.orth] lexeme = self.vocab[doc.c[i].lex.orth]
lexeme.set_flag(tag, True) lexeme.set_flag(tag, True)
self._phrase_key[i] = lexeme.orth phrase_key[i] = lexeme.orth
phrase_hash = hash64(self._phrase_key, phrase_hash = hash64(phrase_key,
self.max_length * sizeof(attr_t), 0) length * sizeof(attr_t), 0)
self.phrase_ids.set(phrase_hash, <void*>ent_id) self.phrase_ids.set(phrase_hash, <void*>ent_id)
def __call__(self, Doc doc): def __call__(self, Doc doc):
@ -548,28 +638,45 @@ cdef class PhraseMatcher:
on_match(self, doc, i, matches) on_match(self, doc, i, matches)
return matches return matches
def pipe(self, stream, batch_size=1000, n_threads=2): def pipe(self, stream, batch_size=1000, n_threads=2, return_matches=False,
as_tuples=False):
"""Match a stream of documents, yielding them in turn. """Match a stream of documents, yielding them in turn.
docs (iterable): A stream of documents. docs (iterable): A stream of documents.
batch_size (int): Number of documents to accumulate into a working set. batch_size (int): Number of documents to accumulate into a working set.
n_threads (int): The number of threads with which to work on the buffer n_threads (int): The number of threads with which to work on the buffer
in parallel, if the implementation supports multi-threading. in parallel, if the implementation supports multi-threading.
return_matches (bool): Yield the match lists along with the docs, making
results (doc, matches) tuples.
as_tuples (bool): Interpret the input stream as (doc, context) tuples,
and yield (result, context) tuples out.
If both return_matches and as_tuples are True, the output will
be a sequence of ((doc, matches), context) tuples.
YIELDS (Doc): Documents, in order. YIELDS (Doc): Documents, in order.
""" """
if as_tuples:
for doc, context in stream:
matches = self(doc)
if return_matches:
yield ((doc, matches), context)
else:
yield (doc, context)
else:
for doc in stream: for doc in stream:
self(doc) matches = self(doc)
if return_matches:
yield (doc, matches)
else:
yield doc yield doc
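
A usage sketch for the new flags (assuming nlp is a loaded pipeline and matcher is a populated PhraseMatcher; both names are illustrative, not part of the patch):

# Hypothetical usage of the return_matches / as_tuples flags added above.
data = [("Barack Obama visited Berlin", {"source": "news"}),
        ("Angela Merkel gave a speech", {"source": "blog"})]
stream = ((nlp(text), ctx) for text, ctx in data)
for (doc, matches), ctx in matcher.pipe(stream, return_matches=True, as_tuples=True):
    print(ctx["source"], [doc[start:end].text for _, start, end in matches])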
def accept_match(self, Doc doc, int start, int end): def accept_match(self, Doc doc, int start, int end):
assert (end - start) < self.max_length
cdef int i, j cdef int i, j
for i in range(self.max_length): cdef Pool mem = Pool()
self._phrase_key[i] = 0 phrase_key = <attr_t*>mem.alloc(end-start, sizeof(attr_t))
for i, j in enumerate(range(start, end)): for i, j in enumerate(range(start, end)):
self._phrase_key[i] = doc.c[j].lex.orth phrase_key[i] = doc.c[j].lex.orth
cdef hash_t key = hash64(self._phrase_key, cdef hash_t key = hash64(phrase_key,
self.max_length * sizeof(attr_t), 0) (end-start) * sizeof(attr_t), 0)
ent_id = <hash_t>self.phrase_ids.get(key) ent_id = <hash_t>self.phrase_ids.get(key)
if ent_id == 0: if ent_id == 0:
return None return None

View File

@ -47,7 +47,9 @@ cdef class Morphology:
cdef enum univ_morph_t: cdef enum univ_morph_t:
NIL = 0 NIL = 0
Animacy_anim = symbols.Animacy_anim Animacy_anim = symbols.Animacy_anim
Animacy_inam Animacy_inan
Animacy_hum
Animacy_nhum
Aspect_freq Aspect_freq
Aspect_imp Aspect_imp
Aspect_mod Aspect_mod

View File

@ -184,7 +184,9 @@ cdef class Morphology:
IDS = { IDS = {
"Animacy_anim": Animacy_anim, "Animacy_anim": Animacy_anim,
"Animacy_inam": Animacy_inam, "Animacy_inan": Animacy_inan,
"Animacy_hum": Animacy_hum, # U20
"Animacy_nhum": Animacy_nhum,
"Aspect_freq": Aspect_freq, "Aspect_freq": Aspect_freq,
"Aspect_imp": Aspect_imp, "Aspect_imp": Aspect_imp,
"Aspect_mod": Aspect_mod, "Aspect_mod": Aspect_mod,

View File

@ -25,6 +25,7 @@ from .morphology cimport Morphology
from .vocab cimport Vocab from .vocab cimport Vocab
from .syntax import nonproj from .syntax import nonproj
from .compat import json_dumps from .compat import json_dumps
from .matcher import Matcher
from .attrs import POS from .attrs import POS
from .parts_of_speech import X from .parts_of_speech import X
@ -97,6 +98,17 @@ def merge_entities(doc):
return doc return doc
def merge_subtokens(doc, label='subtok'):
merger = Matcher(doc.vocab)
merger.add('SUBTOK', None, [{'DEP': label, 'op': '+'}])
matches = merger(doc)
spans = [doc[start:end+1] for _, start, end in matches]
offsets = [(span.start_char, span.end_char) for span in spans]
for start_char, end_char in offsets:
doc.merge(start_char, end_char)
return doc
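
For context, a minimal sketch of how this helper would be applied to a parsed Doc (the nlp pipeline here is assumed, not part of the patch; nothing is merged unless the parser actually produced 'subtok' arcs):

# Illustrative only: collapse runs of tokens the parser labelled 'subtok'
# into single tokens, so downstream components see whole words.
doc = nlp("some text the parser split into sub tokens")
doc = merge_subtokens(doc, label='subtok')
print([t.text for t in doc])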
class Pipe(object): class Pipe(object):
"""This class is not instantiated directly. Components inherit from it, and """This class is not instantiated directly. Components inherit from it, and
it defines the interface that components should follow to function as it defines the interface that components should follow to function as
@ -652,11 +664,13 @@ class MultitaskObjective(Tagger):
self.make_label = self.make_dep_tag_offset self.make_label = self.make_dep_tag_offset
elif target == 'ent_tag': elif target == 'ent_tag':
self.make_label = self.make_ent_tag self.make_label = self.make_ent_tag
elif target == 'sent_start':
self.make_label = self.make_sent_start
elif hasattr(target, '__call__'): elif hasattr(target, '__call__'):
self.make_label = target self.make_label = target
else: else:
raise ValueError("MultitaskObjective target should be function or " raise ValueError("MultitaskObjective target should be function or "
"one of: dep, tag, ent, dep_tag_offset, ent_tag.") "one of: dep, tag, ent, sent_start, dep_tag_offset, ent_tag.")
self.cfg = dict(cfg) self.cfg = dict(cfg)
self.cfg.setdefault('cnn_maxout_pieces', 2) self.cfg.setdefault('cnn_maxout_pieces', 2)
self.cfg.setdefault('pretrained_dims', self.cfg.setdefault('pretrained_dims',
@ -716,11 +730,7 @@ class MultitaskObjective(Tagger):
for i, gold in enumerate(golds): for i, gold in enumerate(golds):
for j in range(len(docs[i])): for j in range(len(docs[i])):
# Handles alignment for tokenization differences # Handles alignment for tokenization differences
gold_idx = gold.cand_to_gold[j] label = self.make_label(j, gold.words, gold.tags,
if gold_idx is None:
idx += 1
continue
label = self.make_label(gold_idx, gold.words, gold.tags,
gold.heads, gold.labels, gold.ents) gold.heads, gold.labels, gold.ents)
if label is None or label not in self.labels: if label is None or label not in self.labels:
correct[idx] = guesses[idx] correct[idx] = guesses[idx]
@ -765,6 +775,51 @@ class MultitaskObjective(Tagger):
else: else:
return '%s-%s' % (tags[i], ents[i]) return '%s-%s' % (tags[i], ents[i])
@staticmethod
def make_sent_start(target, words, tags, heads, deps, ents, cache=True, _cache={}):
'''A multi-task objective for representing sentence boundaries,
using the BILU scheme (O is impossible).
The implementation of this method uses an internal cache that relies
on the identity of the heads array, to avoid requiring a new piece
of gold data. You can pass cache=False if you know the cache will
do the wrong thing.
'''
assert len(words) == len(heads)
assert target < len(words), (target, len(words))
if cache:
if id(heads) in _cache:
return _cache[id(heads)][target]
else:
for key in list(_cache.keys()):
_cache.pop(key)
sent_tags = ['I-SENT'] * len(words)
_cache[id(heads)] = sent_tags
else:
sent_tags = ['I-SENT'] * len(words)
def _find_root(child):
seen = set([child])
while child is not None and heads[child] != child:
seen.add(child)
child = heads[child]
return child
sentences = {}
for i in range(len(words)):
root = _find_root(i)
if root is None:
sent_tags[i] = None
else:
sentences.setdefault(root, []).append(i)
for root, span in sorted(sentences.items()):
if len(span) == 1:
sent_tags[span[0]] = 'U-SENT'
else:
sent_tags[span[0]] = 'B-SENT'
sent_tags[span[-1]] = 'L-SENT'
return sent_tags[target]
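
A small worked example of the B/I/L/U-SENT scheme, written in plain Python without the cache (the heads array is invented for illustration):

words = ["This", "is", "one", ".", "Another", "!"]
heads = [1, 1, 1, 1, 4, 4]          # each token points at its sentence root

def find_root(i):
    seen = {i}
    while heads[i] != i:
        i = heads[i]
        if i in seen:               # guard against malformed cycles
            return None
        seen.add(i)
    return i

sentences = {}
for i in range(len(words)):
    sentences.setdefault(find_root(i), []).append(i)

sent_tags = ["I-SENT"] * len(words)
for root, span in sorted(sentences.items()):
    if len(span) == 1:
        sent_tags[span[0]] = "U-SENT"
    else:
        sent_tags[span[0]] = "B-SENT"
        sent_tags[span[-1]] = "L-SENT"

print(sent_tags)  # ['B-SENT', 'I-SENT', 'I-SENT', 'L-SENT', 'B-SENT', 'L-SENT']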
class SimilarityHook(Pipe): class SimilarityHook(Pipe):
""" """
@ -823,8 +878,8 @@ class TextCategorizer(Pipe):
name = 'textcat' name = 'textcat'
@classmethod @classmethod
def Model(cls, nr_class=1, width=64, **cfg): def Model(cls, **cfg):
return build_text_classifier(nr_class, width, **cfg) return build_text_classifier(**cfg)
def __init__(self, vocab, model=True, **cfg): def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab self.vocab = vocab
@ -890,6 +945,15 @@ class TextCategorizer(Pipe):
if label in self.labels: if label in self.labels:
return 0 return 0
if self.model not in (None, True, False): if self.model not in (None, True, False):
# This functionality was available previously, but was broken.
# The problem is that we resize the last layer, but the last layer
# is actually just an ensemble. We're not resizing the child layers
# -- a huge problem.
raise ValueError(
"Cannot currently add labels to pre-trained text classifier. "
"Add labels before training begins. This functionality was "
"available in previous versions, but had significant bugs that "
"let to poor performance")
smaller = self.model._layers[-1] smaller = self.model._layers[-1]
larger = Affine(len(self.labels)+1, smaller.nI) larger = Affine(len(self.labels)+1, smaller.nI)
copy_array(larger.W[:smaller.nO], smaller.W) copy_array(larger.W[:smaller.nO], smaller.W)
@ -905,8 +969,9 @@ class TextCategorizer(Pipe):
token_vector_width = 64 token_vector_width = 64
if self.model is True: if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors_length self.cfg['pretrained_dims'] = self.vocab.vectors_length
self.model = self.Model(len(self.labels), token_vector_width, self.cfg['nr_class'] = len(self.labels)
**self.cfg) self.cfg['width'] = token_vector_width
self.model = self.Model(**self.cfg)
link_vectors_to_models(self.vocab) link_vectors_to_models(self.vocab)
if sgd is None: if sgd is None:
sgd = self.create_optimizer() sgd = self.create_optimizer()

View File

@ -1,7 +1,7 @@
# coding: utf8 # coding: utf8
from __future__ import division, print_function, unicode_literals from __future__ import division, print_function, unicode_literals
from .gold import tags_to_entities from .gold import tags_to_entities, GoldParse
class PRFScore(object): class PRFScore(object):
@ -84,6 +84,8 @@ class Scorer(object):
} }
def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')): def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
if len(tokens) != len(gold):
gold = GoldParse.from_annot_tuples(tokens, zip(*gold.orig_annot))
assert len(tokens) == len(gold) assert len(tokens) == len(gold)
gold_deps = set() gold_deps = set()
gold_tags = set() gold_tags = set()
@ -100,7 +102,6 @@ class Scorer(object):
continue continue
gold_i = gold.cand_to_gold[token.i] gold_i = gold.cand_to_gold[token.i]
if gold_i is None: if gold_i is None:
if token.dep_.lower() not in punct_labels:
self.tokens.fp += 1 self.tokens.fp += 1
else: else:
self.tokens.tp += 1 self.tokens.tp += 1

View File

@ -85,6 +85,7 @@ cdef enum symbol_t:
SENT_START SENT_START
SPACY SPACY
PROB PROB
LANG
ADJ ADJ
ADP ADP
@ -108,8 +109,9 @@ cdef enum symbol_t:
SPACE SPACE
Animacy_anim Animacy_anim
Animacy_inam Animacy_inan
Animacy_hum # U20 Animacy_hum # U20
Animacy_nhum
Aspect_freq Aspect_freq
Aspect_imp Aspect_imp
Aspect_mod Aspect_mod
@ -393,6 +395,7 @@ cdef enum symbol_t:
EVENT EVENT
WORK_OF_ART WORK_OF_ART
LANGUAGE LANGUAGE
LAW
DATE DATE
TIME TIME
@ -451,10 +454,9 @@ cdef enum symbol_t:
prt prt
punct punct
quantmod quantmod
relcl
rcmod rcmod
root root
xcomp xcomp
acl acl
LAW
LANG

View File

@ -114,8 +114,9 @@ IDS = {
"SPACE": SPACE, "SPACE": SPACE,
"Animacy_anim": Animacy_anim, "Animacy_anim": Animacy_anim,
"Animacy_inam": Animacy_inam, "Animacy_inam": Animacy_inan,
"Animacy_hum": Animacy_hum, # U20 "Animacy_hum": Animacy_hum, # U20
"Animacy_nhum": Animacy_nhum,
"Aspect_freq": Aspect_freq, "Aspect_freq": Aspect_freq,
"Aspect_imp": Aspect_imp, "Aspect_imp": Aspect_imp,
"Aspect_mod": Aspect_mod, "Aspect_mod": Aspect_mod,
@ -458,6 +459,7 @@ IDS = {
"punct": punct, "punct": punct,
"quantmod": quantmod, "quantmod": quantmod,
"rcmod": rcmod, "rcmod": rcmod,
"relcl": relcl,
"root": root, "root": root,
"xcomp": xcomp, "xcomp": xcomp,

View File

@ -108,7 +108,7 @@ cdef cppclass StateC:
ids[1] = this.B(1) ids[1] = this.B(1)
ids[2] = this.S(0) ids[2] = this.S(0)
ids[3] = this.S(1) ids[3] = this.S(1)
ids[4] = this.H(this.S(0)) ids[4] = this.S(2)
ids[5] = this.L(this.B(0), 1) ids[5] = this.L(this.B(0), 1)
ids[6] = this.L(this.S(0), 1) ids[6] = this.L(this.S(0), 1)
ids[7] = this.R(this.S(0), 1) ids[7] = this.R(this.S(0), 1)

View File

@ -6,16 +6,19 @@ from __future__ import unicode_literals
from cpython.ref cimport Py_INCREF from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from collections import OrderedDict from collections import OrderedDict, defaultdict, Counter
from thinc.extra.search cimport Beam from thinc.extra.search cimport Beam
import json
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ._state cimport StateC from ._state cimport StateC
from .nonproj import is_nonproj_tree from . import nonproj
from .transition_system cimport move_cost_func_t, label_cost_func_t from .transition_system cimport move_cost_func_t, label_cost_func_t
from ..gold cimport GoldParse, GoldParseC from ..gold cimport GoldParse, GoldParseC
from ..structs cimport TokenC from ..structs cimport TokenC
# Calculate cost as gold/not gold. We don't use the scalar value anyway.
cdef int BINARY_COSTS = 1
DEF NON_MONOTONIC = True DEF NON_MONOTONIC = True
DEF USE_BREAK = True DEF USE_BREAK = True
@ -54,6 +57,8 @@ cdef weight_t push_cost(StateClass stcls, const GoldParseC* gold, int target) no
cost += 1 cost += 1
if gold.heads[S_i] == target and (NON_MONOTONIC or not stcls.has_head(S_i)): if gold.heads[S_i] == target and (NON_MONOTONIC or not stcls.has_head(S_i)):
cost += 1 cost += 1
if BINARY_COSTS and cost >= 1:
return cost
cost += Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0 cost += Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0
return cost return cost
@ -67,6 +72,8 @@ cdef weight_t pop_cost(StateClass stcls, const GoldParseC* gold, int target) nog
cost += gold.heads[target] == B_i cost += gold.heads[target] == B_i
if gold.heads[B_i] == B_i or gold.heads[B_i] < target: if gold.heads[B_i] == B_i or gold.heads[B_i] < target:
break break
if BINARY_COSTS and cost >= 1:
return cost
if Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0: if Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0:
cost += 1 cost += 1
return cost return cost
@ -110,7 +117,8 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
cdef class Shift: cdef class Shift:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and st.B_(0).sent_start != 1 sent_start = st._sent[st.B_(0).l_edge].sent_start
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and sent_start != 1
@staticmethod @staticmethod
cdef int transition(StateC* st, attr_t label) nogil: cdef int transition(StateC* st, attr_t label) nogil:
@ -170,7 +178,8 @@ cdef class Reduce:
cdef class LeftArc: cdef class LeftArc:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
return st.B_(0).sent_start != 1 sent_start = st._sent[st.B_(0).l_edge].sent_start
return sent_start != 1
@staticmethod @staticmethod
cdef int transition(StateC* st, attr_t label) nogil: cdef int transition(StateC* st, attr_t label) nogil:
@ -205,7 +214,8 @@ cdef class RightArc:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
# If there's (perhaps partial) parse pre-set, don't allow cycle. # If there's (perhaps partial) parse pre-set, don't allow cycle.
return st.B_(0).sent_start != 1 and st.H(st.S(0)) != st.B(0) sent_start = st._sent[st.B_(0).l_edge].sent_start
return sent_start != 1 and st.H(st.S(0)) != st.B(0)
@staticmethod @staticmethod
cdef int transition(StateC* st, attr_t label) nogil: cdef int transition(StateC* st, attr_t label) nogil:
@ -312,39 +322,42 @@ cdef class ArcEager(TransitionSystem):
@classmethod @classmethod
def get_actions(cls, **kwargs): def get_actions(cls, **kwargs):
actions = kwargs.get('actions', OrderedDict(( min_freq = kwargs.get('min_freq', None)
(SHIFT, ['']), actions = defaultdict(lambda: Counter())
(REDUCE, ['']), actions[SHIFT][''] = 1
(RIGHT, []), actions[REDUCE][''] = 1
(LEFT, []),
(BREAK, ['ROOT']))
))
seen_actions = set()
for label in kwargs.get('left_labels', []): for label in kwargs.get('left_labels', []):
if label.upper() != 'ROOT': actions[LEFT][label] = 1
if (LEFT, label) not in seen_actions: actions[SHIFT][label] = 1
actions[LEFT].append(label)
seen_actions.add((LEFT, label))
for label in kwargs.get('right_labels', []): for label in kwargs.get('right_labels', []):
if label.upper() != 'ROOT': actions[RIGHT][label] = 1
if (RIGHT, label) not in seen_actions: actions[REDUCE][label] = 1
actions[RIGHT].append(label)
seen_actions.add((RIGHT, label))
for raw_text, sents in kwargs.get('gold_parses', []): for raw_text, sents in kwargs.get('gold_parses', []):
for (ids, words, tags, heads, labels, iob), ctnts in sents: for (ids, words, tags, heads, labels, iob), ctnts in sents:
heads, labels = nonproj.projectivize(heads, labels)
for child, head, label in zip(ids, heads, labels): for child, head, label in zip(ids, heads, labels):
if label.upper() == 'ROOT' : if label.upper() == 'ROOT' :
label = 'ROOT' label = 'ROOT'
if label != 'ROOT': if head == child:
if head < child: actions[BREAK][label] += 1
if (RIGHT, label) not in seen_actions: elif head < child:
actions[RIGHT].append(label) actions[RIGHT][label] += 1
seen_actions.add((RIGHT, label)) actions[REDUCE][''] += 1
elif head > child: elif head > child:
if (LEFT, label) not in seen_actions: actions[LEFT][label] += 1
actions[LEFT].append(label) actions[SHIFT][''] += 1
seen_actions.add((LEFT, label)) if min_freq is not None:
for action, label_freqs in actions.items():
for label, freq in list(label_freqs.items()):
if freq < min_freq:
label_freqs.pop(label)
# Ensure these actions are present
actions[BREAK].setdefault('ROOT', 0)
actions[RIGHT].setdefault('subtok', 0)
actions[LEFT].setdefault('subtok', 0)
# Used for backoff
actions[RIGHT].setdefault('dep', 0)
actions[LEFT].setdefault('dep', 0)
return actions return actions
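
The rewrite above replaces plain label lists with per-action Counters, so label frequencies can later be thresholded by min_freq. A rough standalone sketch of the counting step (the action names are placeholders for the transition IDs):

from collections import Counter, defaultdict

SHIFT, REDUCE, LEFT, RIGHT, BREAK = 'SHIFT', 'REDUCE', 'LEFT', 'RIGHT', 'BREAK'
actions = defaultdict(Counter)
actions[SHIFT][''] = 1
actions[REDUCE][''] = 1
# (child, head, label) triples; head == child marks a sentence root.
arcs = [(0, 2, 'det'), (1, 2, 'amod'), (2, 2, 'ROOT'), (3, 2, 'punct')]
for child, head, label in arcs:
    if head == child:
        actions[BREAK][label] += 1
    elif head < child:
        actions[RIGHT][label] += 1
        actions[REDUCE][''] += 1
    else:
        actions[LEFT][label] += 1
        actions[SHIFT][''] += 1
print({action: dict(counts) for action, counts in actions.items()})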
property action_types: property action_types:
@ -376,18 +389,34 @@ cdef class ArcEager(TransitionSystem):
def preprocess_gold(self, GoldParse gold): def preprocess_gold(self, GoldParse gold):
if not self.has_gold(gold): if not self.has_gold(gold):
return None return None
for i in range(gold.length): for i, (head, dep) in enumerate(zip(gold.heads, gold.labels)):
# Missing values # Missing values
if gold.heads[i] is None or gold.labels[i] is None: if head is None or dep is None:
gold.c.heads[i] = i gold.c.heads[i] = i
gold.c.has_dep[i] = False gold.c.has_dep[i] = False
else: else:
label = gold.labels[i] if head > i:
action = LEFT
elif head < i:
action = RIGHT
else:
action = BREAK
if dep not in self.labels[action]:
if action == BREAK:
dep = 'ROOT'
elif nonproj.is_decorated(dep):
backoff = nonproj.decompose(dep)[0]
if backoff in self.labels[action]:
dep = backoff
else:
dep = 'dep'
else:
dep = 'dep'
gold.c.has_dep[i] = True gold.c.has_dep[i] = True
if label.upper() == 'ROOT': if dep.upper() == 'ROOT':
label = 'ROOT' dep = 'ROOT'
gold.c.heads[i] = gold.heads[i] gold.c.heads[i] = head
gold.c.labels[i] = self.strings.add(label) gold.c.labels[i] = self.strings.add(dep)
return gold return gold
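
The backoff logic above can be summarised with a small sketch (the label set and the '||' handling are simplified; the real code also distinguishes the BREAK/ROOT case and checks per-action label sets):

known = {'nsubj', 'dobj', 'prep', 'dep'}

def backoff(dep):
    # Unseen labels fall back to their undecorated head label, else to 'dep'.
    if dep in known:
        return dep
    head_label, _, _ = dep.partition('||')
    return head_label if head_label in known else 'dep'

print(backoff('dobj||prep'))   # 'dobj'
print(backoff('weird_label'))  # 'dep'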
def get_beam_parses(self, Beam beam): def get_beam_parses(self, Beam beam):
@ -527,8 +556,13 @@ cdef class ArcEager(TransitionSystem):
is_valid[i] = False is_valid[i] = False
costs[i] = 9000 costs[i] = 9000
if n_gold < 1: if n_gold < 1:
# Check projectivity --- leading cause # Check label set --- leading cause
if is_nonproj_tree(gold.heads): label_set = set([self.strings[self.c[i].label] for i in range(self.n_moves)])
for label_str in gold.labels:
if label_str is not None and label_str not in label_set:
raise ValueError("Cannot get gold parser action: unknown label: %s" % label_str)
# Check projectivity --- other leading cause
if nonproj.is_nonproj_tree(gold.heads):
raise ValueError( raise ValueError(
"Could not find a gold-standard action to supervise the " "Could not find a gold-standard action to supervise the "
"dependency parser. Likely cause: the tree is " "dependency parser. Likely cause: the tree is "

View File

@ -3,7 +3,7 @@ from __future__ import unicode_literals
from thinc.typedefs cimport weight_t from thinc.typedefs cimport weight_t
from thinc.extra.search cimport Beam from thinc.extra.search cimport Beam
from collections import OrderedDict from collections import OrderedDict, Counter
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ._state cimport StateC from ._state cimport StateC
@ -64,21 +64,18 @@ cdef class BiluoPushDown(TransitionSystem):
@classmethod @classmethod
def get_actions(cls, **kwargs): def get_actions(cls, **kwargs):
actions = kwargs.get('actions', OrderedDict(( actions = {
(MISSING, ['']), MISSING: Counter(),
(BEGIN, []), BEGIN: Counter(),
(IN, []), IN: Counter(),
(LAST, []), LAST: Counter(),
(UNIT, []), UNIT: Counter(),
(OUT, ['']) OUT: Counter()
))) }
seen_entities = set() actions[OUT][''] = 1
for entity_type in kwargs.get('entity_types', []): for entity_type in kwargs.get('entity_types', []):
if entity_type in seen_entities:
continue
seen_entities.add(entity_type)
for action in (BEGIN, IN, LAST, UNIT): for action in (BEGIN, IN, LAST, UNIT):
actions[action].append(entity_type) actions[action][entity_type] = 1
moves = ('M', 'B', 'I', 'L', 'U') moves = ('M', 'B', 'I', 'L', 'U')
for raw_text, sents in kwargs.get('gold_parses', []): for raw_text, sents in kwargs.get('gold_parses', []):
for (ids, words, tags, heads, labels, biluo), _ in sents: for (ids, words, tags, heads, labels, biluo), _ in sents:
@ -87,10 +84,8 @@ cdef class BiluoPushDown(TransitionSystem):
if ner_tag.count('-') != 1: if ner_tag.count('-') != 1:
raise ValueError(ner_tag) raise ValueError(ner_tag)
_, label = ner_tag.split('-') _, label = ner_tag.split('-')
if label not in seen_entities: for action in (BEGIN, IN, LAST, UNIT):
seen_entities.add(label) actions[action][label] += 1
for move_str in ('B', 'I', 'L', 'U'):
actions[moves.index(move_str)].append(label)
return actions return actions
property action_types: property action_types:
@ -213,7 +208,7 @@ cdef class BiluoPushDown(TransitionSystem):
raise Exception(move) raise Exception(move)
return t return t
def add_action(self, int action, label_name): def add_action(self, int action, label_name, freq=None):
cdef attr_t label_id cdef attr_t label_id
if not isinstance(label_name, (int, long)): if not isinstance(label_name, (int, long)):
label_id = self.strings.add(label_name) label_id = self.strings.add(label_name)
@ -234,6 +229,12 @@ cdef class BiluoPushDown(TransitionSystem):
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id) self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
assert self.c[self.n_moves].label == label_id assert self.c[self.n_moves].label == label_id
self.n_moves += 1 self.n_moves += 1
if self.labels.get(action, []):
freq = min(0, min(self.labels[action].values()))
self.labels[action][label_name] = freq-1
else:
self.labels[action] = Counter()
self.labels[action][label_name] = -1
return 1 return 1
cdef int initialize_state(self, StateC* st) nogil: cdef int initialize_state(self, StateC* st) nogil:

View File

@ -15,7 +15,7 @@ cdef class Parser:
cdef readonly object cfg cdef readonly object cfg
cdef public object _multitasks cdef public object _multitasks
cdef void _parseC(self, StateC* state, cdef void _parseC(self, StateC** states, int nr_task,
const float* feat_weights, const float* bias, const float* feat_weights, const float* bias,
const float* hW, const float* hb, const float* hW, const float* hb,
int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil

View File

@ -1,7 +1,6 @@
# cython: infer_types=True # cython: infer_types=True
# cython: cdivision=True # cython: cdivision=True
# cython: boundscheck=False # cython: boundscheck=False
# cython: profile=True
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
@ -28,6 +27,8 @@ from thinc.misc import LayerNorm
from thinc.neural.ops import CupyOps from thinc.neural.ops import CupyOps
from thinc.neural.util import get_array_module from thinc.neural.util import get_array_module
from thinc.linalg cimport Vec, VecVec from thinc.linalg cimport Vec, VecVec
from thinc cimport openblas
from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
from .._ml import link_vectors_to_models, create_default_optimizer from .._ml import link_vectors_to_models, create_default_optimizer
@ -266,7 +267,7 @@ cdef class Parser:
with Model.use_device('cpu'): with Model.use_device('cpu'):
upper = chain( upper = chain(
clone(LayerNorm(Maxout(hidden_width, hidden_width)), depth-1), clone(Maxout(hidden_width, hidden_width), depth-1),
zero_init(Affine(nr_class, hidden_width, drop_factor=0.0)) zero_init(Affine(nr_class, hidden_width, drop_factor=0.0))
) )
@ -302,7 +303,7 @@ cdef class Parser:
""" """
self.vocab = vocab self.vocab = vocab
if moves is True: if moves is True:
self.moves = self.TransitionSystem(self.vocab.strings, {}) self.moves = self.TransitionSystem(self.vocab.strings)
else: else:
self.moves = moves self.moves = moves
if 'beam_width' not in cfg: if 'beam_width' not in cfg:
@ -311,12 +312,7 @@ cdef class Parser:
cfg['beam_density'] = util.env_opt('beam_density', 0.0) cfg['beam_density'] = util.env_opt('beam_density', 0.0)
if 'pretrained_dims' not in cfg: if 'pretrained_dims' not in cfg:
cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
cfg.setdefault('cnn_maxout_pieces', 3)
self.cfg = cfg self.cfg = cfg
if 'actions' in self.cfg:
for action, labels in self.cfg.get('actions', {}).items():
for label in labels:
self.moves.add_action(action, label)
self.model = model self.model = model
self._multitasks = [] self._multitasks = []
@ -423,69 +419,81 @@ cdef class Parser:
cdef int nr_hidden = hidden_weights.shape[0] cdef int nr_hidden = hidden_weights.shape[0]
cdef int nr_task = states.size() cdef int nr_task = states.size()
with nogil: with nogil:
for i in range(nr_task): self._parseC(&states[0], nr_task, feat_weights, bias, hW, hb,
self._parseC(states[i],
feat_weights, bias, hW, hb,
nr_class, nr_hidden, nr_feat, nr_piece) nr_class, nr_hidden, nr_feat, nr_piece)
PyErr_CheckSignals() PyErr_CheckSignals()
tokvecs = self.model[0].ops.unflatten(tokvecs, tokvecs = self.model[0].ops.unflatten(tokvecs,
[len(doc) for doc in docs]) [len(doc) for doc in docs])
return state_objs, tokvecs return state_objs, tokvecs
cdef void _parseC(self, StateC* state, cdef void _parseC(self, StateC** states, int nr_task,
const float* feat_weights, const float* bias, const float* feat_weights, const float* bias,
const float* hW, const float* hb, const float* hW, const float* hb,
int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil: int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil:
token_ids = <int*>calloc(nr_feat, sizeof(int)) token_ids = <int*>calloc(nr_feat, sizeof(int))
is_valid = <int*>calloc(nr_class, sizeof(int)) is_valid = <int*>calloc(nr_class, sizeof(int))
vectors = <float*>calloc(nr_hidden * nr_piece, sizeof(float)) vectors = <float*>calloc(nr_hidden * nr_task, sizeof(float))
scores = <float*>calloc(nr_class, sizeof(float)) unmaxed = <float*>calloc(nr_hidden * nr_piece, sizeof(float))
scores = <float*>calloc(nr_class*nr_task, sizeof(float))
if not (token_ids and is_valid and vectors and scores): if not (token_ids and is_valid and vectors and scores):
with gil: with gil:
PyErr_SetFromErrno(MemoryError) PyErr_SetFromErrno(MemoryError)
PyErr_CheckSignals() PyErr_CheckSignals()
cdef float feature cdef int nr_todo = nr_task
while not state.is_final(): cdef int i, j
cdef vector[StateC*] unfinished
while nr_todo >= 1:
memset(vectors, 0, nr_todo * nr_hidden * sizeof(float))
memset(scores, 0, nr_todo * nr_class * sizeof(float))
for i in range(nr_todo):
state = states[i]
state.set_context_tokens(token_ids, nr_feat) state.set_context_tokens(token_ids, nr_feat)
memset(vectors, 0, nr_hidden * nr_piece * sizeof(float)) memset(unmaxed, 0, nr_hidden * nr_piece * sizeof(float))
memset(scores, 0, nr_class * sizeof(float)) sum_state_features(unmaxed,
sum_state_features(vectors,
feat_weights, token_ids, 1, nr_feat, nr_hidden * nr_piece) feat_weights, token_ids, 1, nr_feat, nr_hidden * nr_piece)
for i in range(nr_hidden * nr_piece): VecVec.add_i(unmaxed,
vectors[i] += bias[i] bias, 1., nr_hidden*nr_piece)
V = vectors state_vector = &vectors[i*nr_hidden]
W = hW for j in range(nr_hidden):
for i in range(nr_hidden): index = j * nr_piece
if nr_piece == 1: which = Vec.arg_max(&unmaxed[index], nr_piece)
feature = V[0] if V[0] >= 0. else 0. state_vector[j] = unmaxed[index + which]
elif nr_piece == 2: # Compute hidden-to-output
feature = V[0] if V[0] >= V[1] else V[1] openblas.simple_gemm(scores, nr_todo, nr_class,
else: vectors, nr_todo, nr_hidden, hW, nr_hidden, nr_class, 0, 0)
feature = Vec.max(V, nr_piece) # Add bias
for j in range(nr_class): for i in range(nr_todo):
scores[j] += feature * W[j] VecVec.add_i(&scores[i*nr_class],
W += nr_class hb, 1., nr_class)
V += nr_piece # Validate actions, argmax, take action.
for i in range(nr_class): for i in range(nr_todo):
scores[i] += hb[i] state = states[i]
self.moves.set_valid(is_valid, state) self.moves.set_valid(is_valid, state)
guess = arg_max_if_valid(scores, is_valid, nr_class) guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
action = self.moves.c[guess] action = self.moves.c[guess]
action.do(state, action.label) action.do(state, action.label)
state.push_hist(guess) state.push_hist(guess)
if not state.is_final():
unfinished.push_back(state)
for i in range(unfinished.size()):
states[i] = unfinished[i]
nr_todo = unfinished.size()
unfinished.clear()
free(token_ids) free(token_ids)
free(is_valid) free(is_valid)
free(vectors) free(vectors)
free(unmaxed)
free(scores) free(scores)
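
Conceptually, the rewritten loop scores all unfinished states at once: a per-state maxout over the feature pieces, then one matrix multiply against the hidden-to-output weights. A numpy sketch of that shape bookkeeping (sizes are arbitrary):

import numpy as np

nr_task, nr_hidden, nr_piece, nr_class = 4, 8, 2, 5
unmaxed = np.random.rand(nr_task, nr_hidden, nr_piece)  # summed features + bias
vectors = unmaxed.max(axis=-1)                          # maxout per hidden unit
hW = np.random.rand(nr_hidden, nr_class)
hb = np.random.rand(nr_class)
scores = vectors @ hW + hb                              # single GEMM for the batch
print(scores.shape)                                     # (4, 5) == (nr_task, nr_class)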
def beam_parse(self, docs, int beam_width=3, float beam_density=0.001): def beam_parse(self, docs, int beam_width=3, float beam_density=0.001,
float drop=0.):
cdef Beam beam cdef Beam beam
cdef np.ndarray scores cdef np.ndarray scores
cdef Doc doc cdef Doc doc
cdef int nr_class = self.moves.n_moves cdef int nr_class = self.moves.n_moves
cuda_stream = util.get_cuda_stream() cuda_stream = util.get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model( (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
docs, cuda_stream, 0.0) docs, cuda_stream, drop)
cdef int offset = 0 cdef int offset = 0
cdef int j = 0 cdef int j = 0
cdef int k cdef int k
@ -524,8 +532,8 @@ cdef class Parser:
n_states += 1 n_states += 1
if n_states == 0: if n_states == 0:
break break
vectors = state2vec(token_ids[:n_states]) vectors, _ = state2vec.begin_update(token_ids[:n_states], drop)
scores = vec2scores(vectors) scores, _ = vec2scores.begin_update(vectors, drop=drop)
c_scores = <float*>scores.data c_scores = <float*>scores.data
for beam in todo: for beam in todo:
for i in range(beam.size): for i in range(beam.size):
@ -556,7 +564,10 @@ cdef class Parser:
for multitask in self._multitasks: for multitask in self._multitasks:
multitask.update(docs, golds, drop=drop, sgd=sgd) multitask.update(docs, golds, drop=drop, sgd=sgd)
cuda_stream = util.get_cuda_stream() cuda_stream = util.get_cuda_stream()
states, golds, max_steps = self._init_gold_batch(docs, golds) # Chop sequences into lengths of this many transitions, to make the
# batch a uniform length.
cut_gold = numpy.random.choice(range(20, 100))
states, golds, max_steps = self._init_gold_batch(docs, golds, max_length=cut_gold)
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
drop) drop)
todo = [(s, g) for (s, g) in zip(states, golds) todo = [(s, g) for (s, g) in zip(states, golds)
@ -659,8 +670,7 @@ cdef class Parser:
for beam in beams: for beam in beams:
_cleanup(beam) _cleanup(beam)
def _init_gold_batch(self, whole_docs, whole_golds, min_length=5, max_length=500):
def _init_gold_batch(self, whole_docs, whole_golds):
"""Make a square batch, of length equal to the shortest doc. A long """Make a square batch, of length equal to the shortest doc. A long
doc will get multiple states. Let's say we have a doc of length 2*N, doc will get multiple states. Let's say we have a doc of length 2*N,
where N is the shortest doc. We'll make two states, one representing where N is the shortest doc. We'll make two states, one representing
@ -669,7 +679,7 @@ cdef class Parser:
StateClass state StateClass state
Transition action Transition action
whole_states = self.moves.init_batch(whole_docs) whole_states = self.moves.init_batch(whole_docs)
max_length = max(5, min(50, min([len(doc) for doc in whole_docs]))) max_length = max(min_length, min(max_length, min([len(doc) for doc in whole_docs])))
max_moves = 0 max_moves = 0
states = [] states = []
golds = [] golds = []
@ -791,6 +801,11 @@ cdef class Parser:
for doc in docs: for doc in docs:
hook(doc) hook(doc)
@property
def labels(self):
class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)]
return class_names
@property @property
def tok2vec(self): def tok2vec(self):
'''Return the embedding and convolutional layer of the model.''' '''Return the embedding and convolutional layer of the model.'''
@ -809,9 +824,6 @@ cdef class Parser:
for action in self.moves.action_types: for action in self.moves.action_types:
added = self.moves.add_action(action, label) added = self.moves.add_action(action, label)
if added: if added:
# Important that the labels be stored as a list! We need the
# order, or the model goes out of synch
self.cfg.setdefault('extra_labels', []).append(label)
resized = True resized = True
if self.model not in (True, False, None) and resized: if self.model not in (True, False, None) and resized:
# Weights are stored in (nr_out, nr_in) format, so we're basically # Weights are stored in (nr_out, nr_in) format, so we're basically

View File

@ -9,7 +9,7 @@ from __future__ import unicode_literals
from copy import copy from copy import copy
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc, set_children_from_heads
DELIMITER = '||' DELIMITER = '||'
@ -74,7 +74,21 @@ def decompose(label):
def is_decorated(label): def is_decorated(label):
return label.find(DELIMITER) != -1 return DELIMITER in label
def count_decorated_labels(gold_tuples):
freqs = {}
for raw_text, sents in gold_tuples:
for (ids, words, tags, heads, labels, iob), ctnts in sents:
proj_heads, deco_labels = projectivize(heads, labels)
# set the label to ROOT for each root dependent
deco_labels = ['ROOT' if head == i else deco_labels[i]
for i, head in enumerate(proj_heads)]
# count label frequencies
for label in deco_labels:
if is_decorated(label):
freqs[label] = freqs.get(label, 0) + 1
return freqs
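
A compact illustration of what counts as a "decorated" label here (the '||' delimiter is added by projectivize() when lifting non-projective arcs; the label list is invented):

DELIMITER = '||'
labels = ['nsubj', 'dobj||prep', 'ROOT', 'dobj||prep']
freqs = {}
for label in labels:
    if DELIMITER in label:          # decorated, i.e. pseudo-projective
        freqs[label] = freqs.get(label, 0) + 1
print(freqs)                        # {'dobj||prep': 2}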
def preprocess_training_data(gold_tuples, label_freq_cutoff=30): def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
@ -124,8 +138,9 @@ cpdef deprojectivize(Doc doc):
if DELIMITER in label: if DELIMITER in label:
new_label, head_label = label.split(DELIMITER) new_label, head_label = label.split(DELIMITER)
new_head = _find_new_head(doc[i], head_label) new_head = _find_new_head(doc[i], head_label)
doc[i].head = new_head doc.c[i].head = new_head.i - i
doc.c[i].dep = doc.vocab.strings.add(new_label) doc.c[i].dep = doc.vocab.strings.add(new_label)
set_children_from_heads(doc.c, doc.length)
return doc return doc
@ -191,9 +206,12 @@ def _filter_labels(gold_tuples, cutoff, freqs):
for raw_text, sents in gold_tuples: for raw_text, sents in gold_tuples:
filtered_sents = [] filtered_sents = []
for (ids, words, tags, heads, labels, iob), ctnts in sents: for (ids, words, tags, heads, labels, iob), ctnts in sents:
filtered_labels = [decompose(label)[0] filtered_labels = []
if freqs.get(label, cutoff) < cutoff for label in labels:
else label for label in labels] if is_decorated(label) and freqs.get(label, 0) < cutoff:
filtered_labels.append(decompose(label)[0])
else:
filtered_labels.append(label)
filtered_sents.append( filtered_sents.append(
((ids, words, tags, heads, filtered_labels, iob), ctnts)) ((ids, words, tags, heads, filtered_labels, iob), ctnts))
filtered.append((raw_text, filtered_sents)) filtered.append((raw_text, filtered_sents))

View File

@ -42,6 +42,7 @@ cdef class TransitionSystem:
cdef public attr_t root_label cdef public attr_t root_label
cdef public freqs cdef public freqs
cdef init_state_t init_beam_state cdef init_state_t init_beam_state
cdef public object labels
cdef int initialize_state(self, StateC* state) nogil cdef int initialize_state(self, StateC* state) nogil
cdef int finalize_state(self, StateC* state) nogil cdef int finalize_state(self, StateC* state) nogil

View File

@ -5,7 +5,7 @@ from __future__ import unicode_literals
from cpython.ref cimport Py_INCREF from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t from thinc.typedefs cimport weight_t
from collections import OrderedDict from collections import OrderedDict, Counter
import ujson import ujson
from ..structs cimport TokenC from ..structs cimport TokenC
@ -28,7 +28,7 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
cdef class TransitionSystem: cdef class TransitionSystem:
def __init__(self, StringStore string_table, labels_by_action): def __init__(self, StringStore string_table, labels_by_action=None, min_freq=None):
self.mem = Pool() self.mem = Pool()
self.strings = string_table self.strings = string_table
self.n_moves = 0 self.n_moves = 0
@ -36,21 +36,14 @@ cdef class TransitionSystem:
self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition)) self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition))
for action, label_strs in labels_by_action.items(): self.labels = {}
for label_str in label_strs: if labels_by_action:
self.add_action(int(action), label_str) self.initialize_actions(labels_by_action, min_freq=min_freq)
self.root_label = self.strings.add('ROOT') self.root_label = self.strings.add('ROOT')
self.init_beam_state = _init_state self.init_beam_state = _init_state
def __reduce__(self): def __reduce__(self):
labels_by_action = OrderedDict() return (self.__class__, (self.strings, self.labels), None, None)
cdef Transition t
for trans in self.c[:self.n_moves]:
label_str = self.strings[trans.label]
labels_by_action.setdefault(trans.move, []).append(label_str)
return (self.__class__,
(self.strings, labels_by_action),
None, None)
def init_batch(self, docs): def init_batch(self, docs):
cdef StateClass state cdef StateClass state
@ -146,6 +139,22 @@ cdef class TransitionSystem:
act = self.c[clas] act = self.c[clas]
return self.move_name(act.move, act.label) return self.move_name(act.move, act.label)
def initialize_actions(self, labels_by_action, min_freq=None):
self.labels = {}
self.n_moves = 0
for action, label_freqs in sorted(labels_by_action.items()):
action = int(action)
# Make sure we take a copy here, and that we get a Counter
self.labels[action] = Counter()
# Have to be careful here: Sorting must be stable, or our model
# won't be read back in correctly.
sorted_labels = [(f, L) for L, f in label_freqs.items()]
sorted_labels.sort()
sorted_labels.reverse()
for freq, label_str in sorted_labels:
self.add_action(int(action), label_str)
self.labels[action][label_str] = freq
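
To make the ordering guarantee concrete: labels are added most-frequent first, with ties broken by the label string, so the class order is reproducible when a model is re-loaded. A minimal sketch:

from collections import Counter

label_freqs = Counter({'dobj': 3, 'nsubj': 5, 'amod': 3})
sorted_labels = [(freq, label) for label, freq in label_freqs.items()]
sorted_labels.sort()
sorted_labels.reverse()
print([label for freq, label in sorted_labels])   # ['nsubj', 'dobj', 'amod']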
def add_action(self, int action, label_name): def add_action(self, int action, label_name):
cdef attr_t label_id cdef attr_t label_id
if not isinstance(label_name, int) and \ if not isinstance(label_name, int) and \
@ -164,6 +173,14 @@ cdef class TransitionSystem:
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id) self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
assert self.c[self.n_moves].label == label_id assert self.c[self.n_moves].label == label_id
self.n_moves += 1 self.n_moves += 1
if self.labels.get(action, []):
new_freq = min(self.labels[action].values())
else:
self.labels[action] = Counter()
new_freq = -1
if new_freq > 0:
new_freq = 0
self.labels[action][label_name] = new_freq-1
return 1 return 1
def to_disk(self, path, **exclude): def to_disk(self, path, **exclude):
@ -178,26 +195,18 @@ cdef class TransitionSystem:
def to_bytes(self, **exclude): def to_bytes(self, **exclude):
transitions = [] transitions = []
for trans in self.c[:self.n_moves]:
transitions.append({
'clas': trans.clas,
'move': trans.move,
'label': self.strings[trans.label],
'name': self.move_name(trans.move, trans.label)
})
serializers = { serializers = {
'transitions': lambda: json_dumps(transitions), 'moves': lambda: json_dumps(self.labels),
'strings': lambda: self.strings.to_bytes() 'strings': lambda: self.strings.to_bytes()
} }
return util.to_bytes(serializers, exclude) return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, **exclude): def from_bytes(self, bytes_data, **exclude):
transitions = [] labels = {}
deserializers = { deserializers = {
'transitions': lambda b: transitions.extend(ujson.loads(b)), 'moves': lambda b: labels.update(ujson.loads(b)),
'strings': lambda b: self.strings.from_bytes(b) 'strings': lambda b: self.strings.from_bytes(b)
} }
msg = util.from_bytes(bytes_data, deserializers, exclude) msg = util.from_bytes(bytes_data, deserializers, exclude)
for trans in transitions: self.initialize_actions(labels)
self.add_action(trans['move'], trans['label'])
return self return self

View File

@ -19,6 +19,15 @@ def doc(en_tokenizer):
return get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps) return get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
@pytest.fixture
def doc_not_parsed(en_tokenizer):
text = "This is a sentence. This is another sentence. And a third."
tokens = en_tokenizer(text)
d = get_doc(tokens.vocab, [t.text for t in tokens])
d.is_parsed = False
return d
def test_spans_sent_spans(doc): def test_spans_sent_spans(doc):
sents = list(doc.sents) sents = list(doc.sents)
assert sents[0].start == 0 assert sents[0].start == 0
@ -34,6 +43,7 @@ def test_spans_root(doc):
assert span.root.text == 'sentence' assert span.root.text == 'sentence'
assert span.root.head.text == 'is' assert span.root.head.text == 'is'
def test_spans_string_fn(doc): def test_spans_string_fn(doc):
span = doc[0:4] span = doc[0:4]
assert len(span) == 4 assert len(span) == 4
@ -41,6 +51,7 @@ def test_spans_string_fn(doc):
assert span.upper_ == 'THIS IS A SENTENCE' assert span.upper_ == 'THIS IS A SENTENCE'
assert span.lower_ == 'this is a sentence' assert span.lower_ == 'this is a sentence'
def test_spans_root2(en_tokenizer): def test_spans_root2(en_tokenizer):
text = "through North and South Carolina" text = "through North and South Carolina"
heads = [0, 3, -1, -2, -4] heads = [0, 3, -1, -2, -4]
@ -49,12 +60,17 @@ def test_spans_root2(en_tokenizer):
assert doc[-2:].root.text == 'Carolina' assert doc[-2:].root.text == 'Carolina'
def test_spans_span_sent(doc): def test_spans_span_sent(doc, doc_not_parsed):
"""Test span.sent property""" """Test span.sent property"""
assert len(list(doc.sents)) assert len(list(doc.sents))
assert doc[:2].sent.root.text == 'is' assert doc[:2].sent.root.text == 'is'
assert doc[:2].sent.text == 'This is a sentence .' assert doc[:2].sent.text == 'This is a sentence .'
assert doc[6:7].sent.root.left_edge.text == 'This' assert doc[6:7].sent.root.left_edge.text == 'This'
# test on manual sbd
doc_not_parsed[0].is_sent_start = True
doc_not_parsed[5].is_sent_start = True
assert doc_not_parsed[1:3].sent == doc_not_parsed[0:5]
assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]
def test_spans_lca_matrix(en_tokenizer): def test_spans_lca_matrix(en_tokenizer):
@ -129,7 +145,7 @@ def test_span_to_array(doc):
assert arr[0, 1] == len(span[0]) assert arr[0, 1] == len(span[0])
def test_span_as_doc(doc): #def test_span_as_doc(doc):
span = doc[4:10] # span = doc[4:10]
span_doc = span.as_doc() # span_doc = span.as_doc()
assert span.text == span_doc.text.strip() # assert span.text == span_doc.text.strip()

View File

@ -1,36 +0,0 @@
# coding: utf-8
"""Find the min-cost alignment between two tokenizations"""
from __future__ import unicode_literals
from ...gold import _min_edit_path as min_edit_path
from ...gold import align
import pytest
@pytest.mark.parametrize('cand,gold,path', [
(["U.S", ".", "policy"], ["U.S.", "policy"], (0, 'MDM')),
(["U.N", ".", "policy"], ["U.S.", "policy"], (1, 'SDM')),
(["The", "cat", "sat", "down"], ["The", "cat", "sat", "down"], (0, 'MMMM')),
(["cat", "sat", "down"], ["The", "cat", "sat", "down"], (1, 'IMMM')),
(["The", "cat", "down"], ["The", "cat", "sat", "down"], (1, 'MMIM')),
(["The", "cat", "sag", "down"], ["The", "cat", "sat", "down"], (1, 'MMSM'))])
def test_gold_lev_align_edit_path(cand, gold, path):
assert min_edit_path(cand, gold) == path
def test_gold_lev_align_edit_path2():
cand = ["your", "stuff"]
gold = ["you", "r", "stuff"]
assert min_edit_path(cand, gold) in [(2, 'ISM'), (2, 'SIM')]
@pytest.mark.parametrize('cand,gold,result', [
(["U.S", ".", "policy"], ["U.S.", "policy"], [0, None, 1]),
(["your", "stuff"], ["you", "r", "stuff"], [None, 2]),
(["i", "like", "2", "guys", " ", "well", "id", "just", "come", "straight", "out"],
["i", "like", "2", "guys", "well", "i", "d", "just", "come", "straight", "out"],
[0, 1, 2, 3, None, 4, None, 7, 8, 9, 10])])
def test_gold_lev_align(cand, gold, result):
assert align(cand, gold) == result

View File

@ -2,9 +2,9 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from ....parts_of_speech import SPACE from ....parts_of_speech import SPACE
from ....compat import unicode_
from ...util import get_doc from ...util import get_doc
import six
import pytest import pytest
@ -24,8 +24,8 @@ def test_tag_names(EN):
text = "I ate pizzas with anchovies." text = "I ate pizzas with anchovies."
doc = EN(text, disable=['parser']) doc = EN(text, disable=['parser'])
assert type(doc[2].pos) == int assert type(doc[2].pos) == int
assert isinstance(doc[2].pos_, six.text_type) assert isinstance(doc[2].pos_, unicode_)
assert isinstance(doc[2].dep_, six.text_type) assert isinstance(doc[2].dep_, unicode_)
assert doc[2].tag_ == u'NNS' assert doc[2].tag_ == u'NNS'

View File

@ -0,0 +1,75 @@
from __future__ import unicode_literals
from ...vocab import Vocab
from ...pipeline import DependencyParser
from ...tokens import Doc
from ...gold import GoldParse
from ...syntax.nonproj import projectivize
annot_tuples = [
(0, 'When', 'WRB', 11, 'advmod', 'O'),
(1, 'Walter', 'NNP', 2, 'compound', 'B-PERSON'),
(2, 'Rodgers', 'NNP', 11, 'nsubj', 'L-PERSON'),
(3, ',', ',', 2, 'punct', 'O'),
(4, 'our', 'PRP$', 6, 'poss', 'O'),
(5, 'embedded', 'VBN', 6, 'amod', 'O'),
(6, 'reporter', 'NN', 2, 'appos', 'O'),
(7, 'with', 'IN', 6, 'prep', 'O'),
(8, 'the', 'DT', 10, 'det', 'B-ORG'),
(9, '3rd', 'NNP', 10, 'compound', 'I-ORG'),
(10, 'Cavalry', 'NNP', 7, 'pobj', 'L-ORG'),
(11, 'says', 'VBZ', 44, 'advcl', 'O'),
(12, 'three', 'CD', 13, 'nummod', 'U-CARDINAL'),
(13, 'battalions', 'NNS', 16, 'nsubj', 'O'),
(14, 'of', 'IN', 13, 'prep', 'O'),
(15, 'troops', 'NNS', 14, 'pobj', 'O'),
(16, 'are', 'VBP', 11, 'ccomp', 'O'),
(17, 'on', 'IN', 16, 'prep', 'O'),
(18, 'the', 'DT', 19, 'det', 'O'),
(19, 'ground', 'NN', 17, 'pobj', 'O'),
(20, ',', ',', 17, 'punct', 'O'),
(21, 'inside', 'IN', 17, 'prep', 'O'),
(22, 'Baghdad', 'NNP', 21, 'pobj', 'U-GPE'),
(23, 'itself', 'PRP', 22, 'appos', 'O'),
(24, ',', ',', 16, 'punct', 'O'),
(25, 'have', 'VBP', 26, 'aux', 'O'),
(26, 'taken', 'VBN', 16, 'dep', 'O'),
(27, 'up', 'RP', 26, 'prt', 'O'),
(28, 'positions', 'NNS', 26, 'dobj', 'O'),
(29, 'they', 'PRP', 31, 'nsubj', 'O'),
(30, "'re", 'VBP', 31, 'aux', 'O'),
(31, 'going', 'VBG', 26, 'parataxis', 'O'),
(32, 'to', 'TO', 33, 'aux', 'O'),
(33, 'spend', 'VB', 31, 'xcomp', 'O'),
(34, 'the', 'DT', 35, 'det', 'B-TIME'),
(35, 'night', 'NN', 33, 'dobj', 'L-TIME'),
(36, 'there', 'RB', 33, 'advmod', 'O'),
(37, 'presumably', 'RB', 33, 'advmod', 'O'),
(38, ',', ',', 44, 'punct', 'O'),
(39, 'how', 'WRB', 40, 'advmod', 'O'),
(40, 'many', 'JJ', 41, 'amod', 'O'),
(41, 'soldiers', 'NNS', 44, 'pobj', 'O'),
(42, 'are', 'VBP', 44, 'aux', 'O'),
(43, 'we', 'PRP', 44, 'nsubj', 'O'),
(44, 'talking', 'VBG', 44, 'ROOT', 'O'),
(45, 'about', 'IN', 44, 'prep', 'O'),
(46, 'right', 'RB', 47, 'advmod', 'O'),
(47, 'now', 'RB', 44, 'advmod', 'O'),
(48, '?', '.', 44, 'punct', 'O')]
def test_get_oracle_actions():
doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
parser = DependencyParser(doc.vocab)
parser.moves.add_action(0, '')
parser.moves.add_action(1, '')
parser.moves.add_action(1, '')
parser.moves.add_action(4, 'ROOT')
for i, (id_, word, tag, head, dep, ent) in enumerate(annot_tuples):
if head > i:
parser.moves.add_action(2, dep)
elif head < i:
parser.moves.add_action(3, dep)
ids, words, tags, heads, deps, ents = zip(*annot_tuples)
heads, deps = projectivize(heads, deps)
gold = GoldParse(doc, words=words, tags=tags, heads=heads, deps=deps)
parser.moves.preprocess_gold(gold)
actions = parser.moves.get_oracle_sequence(doc, gold)

View File

@ -13,8 +13,8 @@ from ...vocab import Vocab
('a b', 0, 2), ('a b', 0, 2),
('a c', 0, 1), ('a c', 0, 1),
('a b c', 0, 2), ('a b c', 0, 2),
('a b b c', 0, 2), ('a b b c', 0, 3),
('a b b', 0, 2), ('a b b', 0, 3),
] ]
) )
def test_issue1450_matcher_end_zero_plus(string, start, end): def test_issue1450_matcher_end_zero_plus(string, start, end):
@ -54,5 +54,6 @@ def test_issue1450_matcher_end_zero_plus(string, start, end):
if start is None or end is None: if start is None or end is None:
assert matches == [] assert matches == []
assert matches[0][1] == start print(matches)
assert matches[0][2] == end assert matches[-1][1] == start
assert matches[-1][2] == end

View File

@ -0,0 +1,65 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from ...matcher import Matcher
import pytest
pattern1 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'*'}]
pattern2 = [{'ORTH':'A','OP':'*'},{'ORTH':'A','OP':'1'}]
pattern3 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'1'}]
pattern4 = [{'ORTH':'B','OP':'1'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
pattern5 = [{'ORTH':'B','OP':'*'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
re_pattern1 = 'AA*'
re_pattern2 = 'A*A'
re_pattern3 = 'AA'
re_pattern4 = 'BA*B'
re_pattern5 = 'B*A*B'
@pytest.fixture
def text():
return "(ABBAAAAAB)."
@pytest.fixture
def doc(en_tokenizer,text):
doc = en_tokenizer(' '.join(text))
return doc
@pytest.mark.xfail
@pytest.mark.parametrize('pattern,re_pattern',[
(pattern1,re_pattern1),
(pattern2,re_pattern2),
(pattern3,re_pattern3),
(pattern4,re_pattern4),
(pattern5,re_pattern5)])
def test_greedy_matching(doc,text,pattern,re_pattern):
"""
Test that the greedy matching behavior of the * op
is consistent with other re implementations
"""
matcher = Matcher(doc.vocab)
matcher.add(re_pattern,None,pattern)
matches = matcher(doc)
re_matches = [m.span() for m in re.finditer(re_pattern,text)]
for match,re_match in zip(matches,re_matches):
assert match[1:]==re_match
@pytest.mark.xfail
@pytest.mark.parametrize('pattern,re_pattern',[
(pattern1,re_pattern1),
(pattern2,re_pattern2),
(pattern3,re_pattern3),
(pattern4,re_pattern4),
(pattern5,re_pattern5)])
def test_match_consuming(doc,text,pattern,re_pattern):
"""
Test that matcher.__call__ consumes tokens on a match
similar to re.findall
"""
matcher = Matcher(doc.vocab)
matcher.add(re_pattern,None,pattern)
matches = matcher(doc)
re_matches = [m.span() for m in re.finditer(re_pattern,text)]
assert len(matches)==len(re_matches)

View File

@ -0,0 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
from ...lang.lex_attrs import is_stop
from ...lang.en.stop_words import STOP_WORDS
import pytest
@pytest.mark.parametrize('word', ['the'])
def test_lex_attrs_stop_words_case_sensitivity(word):
assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)

View File

@ -6,7 +6,6 @@ from ...vocab import Vocab
from ...tokens import Doc from ...tokens import Doc
from ...matcher import Matcher from ...matcher import Matcher
@pytest.mark.xfail
def test_issue1945(): def test_issue1945():
text = "a a a" text = "a a a"
matcher = Matcher(Vocab()) matcher = Matcher(Vocab())

View File

@ -22,10 +22,9 @@ def test_basic_case():
assert end == 4 assert end == 4
@pytest.mark.xfail
def test_issue850(): def test_issue850():
"""The problem here is that the variable-length pattern matches the """The variable-length pattern matches the
succeeding token. We then don't handle the ambiguity correctly.""" succeeding token. Check we handle the ambiguity correctly."""
matcher = Matcher(Vocab( matcher = Matcher(Vocab(
lex_attr_getters={LOWER: lambda string: string.lower()})) lex_attr_getters={LOWER: lambda string: string.lower()}))
IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True) IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)

66
spacy/tests/test_align.py Normal file
View File

@ -0,0 +1,66 @@
from __future__ import unicode_literals
import pytest
from .._align import align, multi_align
@pytest.mark.parametrize('string1,string2,cost', [
('hello', 'hell', 1),
('rat', 'cat', 1),
('rat', 'rat', 0),
('rat', 'catsie', 4),
('t', 'catsie', 5),
])
def test_align_costs(string1, string2, cost):
output_cost, i2j, j2i, matrix = align(string1, string2)
assert output_cost == cost
@pytest.mark.parametrize('string1,string2,i2j', [
('hello', 'hell', [0,1,2,3,-1]),
('rat', 'cat', [0,1,2]),
('rat', 'rat', [0,1,2]),
('rat', 'catsie', [0,1,2]),
('t', 'catsie', [2]),
])
def test_align_i2j(string1, string2, i2j):
output_cost, output_i2j, j2i, matrix = align(string1, string2)
assert list(output_i2j) == i2j
@pytest.mark.parametrize('string1,string2,j2i', [
('hello', 'hell', [0,1,2,3]),
('rat', 'cat', [0,1,2]),
('rat', 'rat', [0,1,2]),
('rat', 'catsie', [0,1,2, -1, -1, -1]),
('t', 'catsie', [-1, -1, 0, -1, -1, -1]),
])
def test_align_j2i(string1, string2, j2i):
output_cost, output_i2j, output_j2i, matrix = align(string1, string2)
assert list(output_j2i) == j2i
def test_align_strings():
words1 = ['hello', 'this', 'is', 'test!']
words2 = ['hellothis', 'is', 'test', '!']
cost, i2j, j2i, matrix = align(words1, words2)
assert cost == 4
assert list(i2j) == [-1, -1, 1, -1]
assert list(j2i) == [-1, 2, -1, -1]
def test_align_many_to_one():
words1 = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
words2 = ['ab', 'bc', 'e', 'fg', 'h']
cost, i2j, j2i, matrix = align(words1, words2)
assert list(i2j) == [-1, -1, -1, -1, 2, -1, -1, 4]
lengths1 = [len(w) for w in words1]
lengths2 = [len(w) for w in words2]
i2j_multi, j2i_multi = multi_align(i2j, j2i, lengths1, lengths2)
assert i2j_multi[0] == 0
assert i2j_multi[1] == 0
assert i2j_multi[2] == 1
assert i2j_multi[3] == 1
assert i2j_multi[3] == 1
assert i2j_multi[5] == 3
assert i2j_multi[6] == 3
assert j2i_multi[0] == 1
assert j2i_multi[1] == 3

View File

@ -3,12 +3,17 @@ from __future__ import unicode_literals
from ..matcher import Matcher, PhraseMatcher from ..matcher import Matcher, PhraseMatcher
from .util import get_doc from .util import get_doc
from ..util import get_lang_class
from ..tokens import Doc from ..tokens import Doc
import pytest import pytest
@pytest.fixture(scope="session")
def en_vocab():
return get_lang_class('en').Defaults.create_vocab()
@pytest.fixture
@pytest.fixture(scope="session")
def matcher(en_vocab): def matcher(en_vocab):
rules = { rules = {
'JS': [[{'ORTH': 'JavaScript'}]], 'JS': [[{'ORTH': 'JavaScript'}]],
@ -21,187 +26,196 @@ def matcher(en_vocab):
return matcher return matcher
def test_matcher_from_api_docs(en_vocab): #def test_matcher_from_api_docs(en_vocab):
matcher = Matcher(en_vocab) # matcher = Matcher(en_vocab)
pattern = [{'ORTH': 'test'}] # pattern = [{'ORTH': 'test'}]
assert len(matcher) == 0 # assert len(matcher) == 0
matcher.add('Rule', None, pattern) # matcher.add('Rule', None, pattern)
assert len(matcher) == 1 # assert len(matcher) == 1
matcher.remove('Rule') # matcher.remove('Rule')
assert 'Rule' not in matcher # assert 'Rule' not in matcher
matcher.add('Rule', None, pattern) # matcher.add('Rule', None, pattern)
assert 'Rule' in matcher # assert 'Rule' in matcher
on_match, patterns = matcher.get('Rule') # on_match, patterns = matcher.get('Rule')
assert len(patterns[0]) # assert len(patterns[0])
#
#
#def test_matcher_from_usage_docs(en_vocab):
# text = "Wow 😀 This is really cool! 😂 😂"
# doc = get_doc(en_vocab, words=text.split(' '))
# pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍']
# pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emoji]
#
# def label_sentiment(matcher, doc, i, matches):
# match_id, start, end = matches[i]
# if doc.vocab.strings[match_id] == 'HAPPY':
# doc.sentiment += 0.1
# span = doc[start : end]
# token = span.merge()
# token.vocab[token.text].norm_ = 'happy emoji'
#
# matcher = Matcher(en_vocab)
# matcher.add('HAPPY', label_sentiment, *pos_patterns)
# matches = matcher(doc)
# assert doc.sentiment != 0
# assert doc[1].norm_ == 'happy emoji'
def test_matcher_from_usage_docs(en_vocab): #@pytest.mark.parametrize('words', [["Some", "words"]])
text = "Wow 😀 This is really cool! 😂 😂" #def test_matcher_init(en_vocab, words):
doc = get_doc(en_vocab, words=text.split(' ')) # matcher = Matcher(en_vocab)
pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍'] # doc = get_doc(en_vocab, words)
pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emoji] # assert len(matcher) == 0
# assert matcher(doc) == []
def label_sentiment(matcher, doc, i, matches): #
match_id, start, end = matches[i] #
if doc.vocab.strings[match_id] == 'HAPPY': #def test_matcher_contains(matcher):
doc.sentiment += 0.1 # matcher.add('TEST', None, [{'ORTH': 'test'}])
span = doc[start : end] # assert 'TEST' in matcher
token = span.merge() # assert 'TEST2' not in matcher
token.vocab[token.text].norm_ = 'happy emoji' #
#
matcher = Matcher(en_vocab) #def test_matcher_no_match(matcher):
matcher.add('HAPPY', label_sentiment, *pos_patterns) # words = ["I", "like", "cheese", "."]
matches = matcher(doc) # doc = get_doc(matcher.vocab, words)
assert doc.sentiment != 0 # assert matcher(doc) == []
assert doc[1].norm_ == 'happy emoji' #
#
#def test_matcher_compile(en_vocab):
@pytest.mark.parametrize('words', [["Some", "words"]]) # rules = {
def test_matcher_init(en_vocab, words): # 'JS': [[{'ORTH': 'JavaScript'}]],
matcher = Matcher(en_vocab) # 'GoogleNow': [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]],
doc = get_doc(en_vocab, words) # 'Java': [[{'LOWER': 'java'}]]
assert len(matcher) == 0 # }
assert matcher(doc) == [] # matcher = Matcher(en_vocab)
# for key, patterns in rules.items():
# matcher.add(key, None, *patterns)
def test_matcher_contains(matcher): # assert len(matcher) == 3
matcher.add('TEST', None, [{'ORTH': 'test'}]) #
assert 'TEST' in matcher #
assert 'TEST2' not in matcher #def test_matcher_match_start(matcher):
# words = ["JavaScript", "is", "good"]
# doc = get_doc(matcher.vocab, words)
def test_matcher_no_match(matcher): # assert matcher(doc) == [(matcher.vocab.strings['JS'], 0, 1)]
words = ["I", "like", "cheese", "."] #
doc = get_doc(matcher.vocab, words) #
assert matcher(doc) == [] #def test_matcher_match_end(matcher):
# words = ["I", "like", "java"]
# doc = get_doc(matcher.vocab, words)
def test_matcher_compile(matcher): # assert matcher(doc) == [(doc.vocab.strings['Java'], 2, 3)]
assert len(matcher) == 3 #
#
#def test_matcher_match_middle(matcher):
def test_matcher_match_start(matcher): # words = ["I", "like", "Google", "Now", "best"]
words = ["JavaScript", "is", "good"] # doc = get_doc(matcher.vocab, words)
doc = get_doc(matcher.vocab, words) # assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4)]
assert matcher(doc) == [(matcher.vocab.strings['JS'], 0, 1)] #
#
#def test_matcher_match_multi(matcher):
def test_matcher_match_end(matcher): # words = ["I", "like", "Google", "Now", "and", "java", "best"]
words = ["I", "like", "java"] # doc = get_doc(matcher.vocab, words)
doc = get_doc(matcher.vocab, words) # assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4),
assert matcher(doc) == [(doc.vocab.strings['Java'], 2, 3)] # (doc.vocab.strings['Java'], 5, 6)]
#
#
def test_matcher_match_middle(matcher): #def test_matcher_empty_dict(en_vocab):
words = ["I", "like", "Google", "Now", "best"] # '''Test matcher allows empty token specs, meaning match on any token.'''
doc = get_doc(matcher.vocab, words) # matcher = Matcher(en_vocab)
assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4)] # abc = ["a", "b", "c"]
# doc = get_doc(matcher.vocab, abc)
# matcher.add('A.C', None, [{'ORTH': 'a'}, {}, {'ORTH': 'c'}])
def test_matcher_match_multi(matcher): # matches = matcher(doc)
words = ["I", "like", "Google", "Now", "and", "java", "best"] # assert len(matches) == 1
doc = get_doc(matcher.vocab, words) # assert matches[0][1:] == (0, 3)
assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4), # matcher = Matcher(en_vocab)
(doc.vocab.strings['Java'], 5, 6)] # matcher.add('A.', None, [{'ORTH': 'a'}, {}])
# matches = matcher(doc)
# assert matches[0][1:] == (0, 2)
def test_matcher_empty_dict(en_vocab): #
'''Test matcher allows empty token specs, meaning match on any token.''' #
matcher = Matcher(en_vocab) #def test_matcher_operator_shadow(en_vocab):
abc = ["a", "b", "c"] # matcher = Matcher(en_vocab)
doc = get_doc(matcher.vocab, abc) # abc = ["a", "b", "c"]
matcher.add('A.C', None, [{'ORTH': 'a'}, {}, {'ORTH': 'c'}]) # doc = get_doc(matcher.vocab, abc)
matches = matcher(doc) # matcher.add('A.C', None, [{'ORTH': 'a'},
assert len(matches) == 1 # {"IS_ALPHA": True, "OP": "+"},
assert matches[0][1:] == (0, 3) # {'ORTH': 'c'}])
matcher = Matcher(en_vocab) # matches = matcher(doc)
matcher.add('A.', None, [{'ORTH': 'a'}, {}]) # assert len(matches) == 1
matches = matcher(doc) # assert matches[0][1:] == (0, 3)
assert matches[0][1:] == (0, 2) #
#
#def test_matcher_phrase_matcher(en_vocab):
def test_matcher_operator_shadow(en_vocab): # words = ["Google", "Now"]
matcher = Matcher(en_vocab) # doc = get_doc(en_vocab, words)
abc = ["a", "b", "c"] # matcher = PhraseMatcher(en_vocab)
doc = get_doc(matcher.vocab, abc) # matcher.add('COMPANY', None, doc)
matcher.add('A.C', None, [{'ORTH': 'a'}, # words = ["I", "like", "Google", "Now", "best"]
{"IS_ALPHA": True, "OP": "+"}, # doc = get_doc(en_vocab, words)
{'ORTH': 'c'}]) # assert len(matcher(doc)) == 1
matches = matcher(doc) #
assert len(matches) == 1 #
assert matches[0][1:] == (0, 3) #def test_phrase_matcher_length(en_vocab):
# matcher = PhraseMatcher(en_vocab)
# assert len(matcher) == 0
def test_matcher_phrase_matcher(en_vocab): # matcher.add('TEST', None, get_doc(en_vocab, ['test']))
words = ["Google", "Now"] # assert len(matcher) == 1
doc = get_doc(en_vocab, words) # matcher.add('TEST2', None, get_doc(en_vocab, ['test2']))
matcher = PhraseMatcher(en_vocab) # assert len(matcher) == 2
matcher.add('COMPANY', None, doc) #
words = ["I", "like", "Google", "Now", "best"] #
doc = get_doc(en_vocab, words) #def test_phrase_matcher_contains(en_vocab):
assert len(matcher(doc)) == 1 # matcher = PhraseMatcher(en_vocab)
# matcher.add('TEST', None, get_doc(en_vocab, ['test']))
# assert 'TEST' in matcher
def test_phrase_matcher_length(en_vocab): # assert 'TEST2' not in matcher
matcher = PhraseMatcher(en_vocab) #
assert len(matcher) == 0 #
matcher.add('TEST', None, get_doc(en_vocab, ['test'])) #def test_matcher_match_zero(matcher):
assert len(matcher) == 1 # words1 = 'He said , " some words " ...'.split()
matcher.add('TEST2', None, get_doc(en_vocab, ['test2'])) # words2 = 'He said , " some three words " ...'.split()
assert len(matcher) == 2 # pattern1 = [{'ORTH': '"'},
# {'OP': '!', 'IS_PUNCT': True},
# {'OP': '!', 'IS_PUNCT': True},
def test_phrase_matcher_contains(en_vocab): # {'ORTH': '"'}]
matcher = PhraseMatcher(en_vocab) # pattern2 = [{'ORTH': '"'},
matcher.add('TEST', None, get_doc(en_vocab, ['test'])) # {'IS_PUNCT': True},
assert 'TEST' in matcher # {'IS_PUNCT': True},
assert 'TEST2' not in matcher # {'IS_PUNCT': True},
# {'ORTH': '"'}]
#
def test_matcher_match_zero(matcher): # matcher.add('Quote', None, pattern1)
words1 = 'He said , " some words " ...'.split() # doc = get_doc(matcher.vocab, words1)
words2 = 'He said , " some three words " ...'.split() # assert len(matcher(doc)) == 1
pattern1 = [{'ORTH': '"'}, #
{'OP': '!', 'IS_PUNCT': True}, # doc = get_doc(matcher.vocab, words2)
{'OP': '!', 'IS_PUNCT': True}, # assert len(matcher(doc)) == 0
{'ORTH': '"'}] # matcher.add('Quote', None, pattern2)
pattern2 = [{'ORTH': '"'}, # assert len(matcher(doc)) == 0
{'IS_PUNCT': True}, #
{'IS_PUNCT': True}, #
{'IS_PUNCT': True}, #def test_matcher_match_zero_plus(matcher):
{'ORTH': '"'}] # words = 'He said , " some words " ...'.split()
# pattern = [{'ORTH': '"'},
matcher.add('Quote', None, pattern1) # {'OP': '*', 'IS_PUNCT': False},
doc = get_doc(matcher.vocab, words1) # {'ORTH': '"'}]
assert len(matcher(doc)) == 1 # matcher = Matcher(matcher.vocab)
# matcher.add('Quote', None, pattern)
doc = get_doc(matcher.vocab, words2) # doc = get_doc(matcher.vocab, words)
assert len(matcher(doc)) == 0 # assert len(matcher(doc)) == 1
matcher.add('Quote', None, pattern2) #
assert len(matcher(doc)) == 0 #
#def test_matcher_match_one_plus(matcher):
# control = Matcher(matcher.vocab)
def test_matcher_match_zero_plus(matcher): # control.add('BasicPhilippe', None, [{'ORTH': 'Philippe'}])
words = 'He said , " some words " ...'.split() # doc = get_doc(control.vocab, ['Philippe', 'Philippe'])
pattern = [{'ORTH': '"'}, # m = control(doc)
{'OP': '*', 'IS_PUNCT': False}, # assert len(m) == 2
{'ORTH': '"'}] # matcher.add('KleenePhilippe', None, [{'ORTH': 'Philippe', 'OP': '1'},
matcher.add('Quote', None, pattern) # {'ORTH': 'Philippe', 'OP': '+'}])
doc = get_doc(matcher.vocab, words) # m = matcher(doc)
assert len(matcher(doc)) == 1 # assert len(m) == 1
#
def test_matcher_match_one_plus(matcher):
control = Matcher(matcher.vocab)
control.add('BasicPhilippe', None, [{'ORTH': 'Philippe'}])
doc = get_doc(control.vocab, ['Philippe', 'Philippe'])
m = control(doc)
assert len(m) == 2
matcher.add('KleenePhilippe', None, [{'ORTH': 'Philippe', 'OP': '1'},
{'ORTH': 'Philippe', 'OP': '+'}])
m = matcher(doc)
assert len(m) == 1
def test_operator_combos(matcher): def test_operator_combos(matcher):
cases = [ cases = [
@ -252,9 +266,8 @@ def test_matcher_end_zero_plus(matcher):
) )
nlp = lambda string: Doc(matcher.vocab, words=string.split()) nlp = lambda string: Doc(matcher.vocab, words=string.split())
assert len(matcher(nlp(u'a'))) == 1 assert len(matcher(nlp(u'a'))) == 1
assert len(matcher(nlp(u'a b'))) == 1 assert len(matcher(nlp(u'a b'))) == 2
assert len(matcher(nlp(u'a b'))) == 1
assert len(matcher(nlp(u'a c'))) == 1 assert len(matcher(nlp(u'a c'))) == 1
assert len(matcher(nlp(u'a b c'))) == 1 assert len(matcher(nlp(u'a b c'))) == 2
assert len(matcher(nlp(u'a b b c'))) == 1 assert len(matcher(nlp(u'a b b c'))) == 3
assert len(matcher(nlp(u'a b b'))) == 1 assert len(matcher(nlp(u'a b b'))) == 3

View File

@ -0,0 +1,44 @@
from __future__ import unicode_literals
import random
import numpy.random
from ..pipeline import TextCategorizer
from ..lang.en import English
from ..vocab import Vocab
from ..tokens import Doc
from ..gold import GoldParse
def test_textcat_learns_multilabel():
random.seed(0)
numpy.random.seed(0)
docs = []
nlp = English()
vocab = nlp.vocab
letters = ['a', 'b', 'c']
for w1 in letters:
for w2 in letters:
cats = {letter: float(w2==letter) for letter in letters}
docs.append((Doc(vocab, words=['d']*3 + [w1, w2] + ['d']*3), cats))
random.shuffle(docs)
model = TextCategorizer(vocab, width=8)
for letter in letters:
model.add_label(letter)
optimizer = model.begin_training()
for i in range(30):
losses = {}
Ys = [GoldParse(doc, cats=cats) for doc, cats in docs]
Xs = [doc for doc, cats in docs]
model.update(Xs, Ys, sgd=optimizer, losses=losses)
random.shuffle(docs)
for w1 in letters:
for w2 in letters:
doc = Doc(vocab, words=['d']*3 + [w1, w2] + ['d']*3)
truth = {letter: w2==letter for letter in letters}
model(doc)
for cat, score in doc.cats.items():
if not truth[cat]:
assert score < 0.5
else:
assert score > 0.5

View File

@ -19,6 +19,9 @@ ctypedef fused LexemeOrToken:
const_TokenC_ptr const_TokenC_ptr
cdef int set_children_from_heads(TokenC* tokens, int length) except -1
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2 cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2

View File

@ -186,6 +186,20 @@ cdef class Doc:
def _(self): def _(self):
return Underscore(Underscore.doc_extensions, self) return Underscore(Underscore.doc_extensions, self)
@property
def is_sentenced(self):
# Check if the document has sentence boundaries,
        # i.e. at least one token has sent_start set to -1 or 1
if 'sents' in self.user_hooks:
return True
if self.is_parsed:
return True
for i in range(self.length):
if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
return True
else:
return False
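A short usage sketch for the property added above (hedged: it assumes a plain unparsed Doc and that the token-level sent_start setter referenced in the error message further down is available):

from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=['Hello', 'world', '.', 'Bye', '.'])
doc[3].sent_start = True    # mark a boundary by hand (assumed setter)
print(doc.is_sentenced)     # True once at least one token carries a boundary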
def __getitem__(self, object i): def __getitem__(self, object i):
"""Get a `Token` or `Span` object. """Get a `Token` or `Span` object.
@ -517,22 +531,16 @@ cdef class Doc:
>>> assert [s.root.text for s in doc.sents] == ["is", "'s"] >>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
""" """
def __get__(self): def __get__(self):
if 'sents' in self.user_hooks: if not self.is_sentenced:
yield from self.user_hooks['sents'](self)
return
cdef int i
if not self.is_parsed:
for i in range(1, self.length):
if self.c[i].sent_start != 0:
break
else:
raise ValueError( raise ValueError(
"Sentence boundaries unset. You can add the 'sentencizer' " "Sentence boundaries unset. You can add the 'sentencizer' "
"component to the pipeline with: " "component to the pipeline with: "
"nlp.add_pipe(nlp.create_pipe('sentencizer')) " "nlp.add_pipe(nlp.create_pipe('sentencizer')) "
"Alternatively, add the dependency parser, or set " "Alternatively, add the dependency parser, or set "
"sentence boundaries by setting doc[i].sent_start") "sentence boundaries by setting doc[i].sent_start")
if 'sents' in self.user_hooks:
yield from self.user_hooks['sents'](self)
else:
start = 0 start = 0
for i in range(1, self.length): for i in range(1, self.length):
if self.c[i].sent_start == 1: if self.c[i].sent_start == 1:
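For reference, the minimal way to satisfy this check is the one the error message itself suggests (a sketch assuming a blank English pipeline):

import spacy

nlp = spacy.blank('en')
nlp.add_pipe(nlp.create_pipe('sentencizer'))
doc = nlp(u"This is a sentence. This is another one.")
print([sent.text for sent in doc.sents])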

View File

@ -285,16 +285,42 @@ cdef class Span:
def __get__(self): def __get__(self):
if 'sent' in self.doc.user_span_hooks: if 'sent' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['sent'](self) return self.doc.user_span_hooks['sent'](self)
# This should raise if we're not parsed. # This should raise if we're not parsed
        # or doesn't have any sbd component :)
self.doc.sents self.doc.sents
# if doc is parsed we can use the deps to find the sentence
# otherwise we use the `sent_start` token attribute
cdef int n = 0 cdef int n = 0
cdef int i
if self.doc.is_parsed:
root = &self.doc.c[self.start] root = &self.doc.c[self.start]
n = 0
while root.head != 0: while root.head != 0:
root += root.head root += root.head
n += 1 n += 1
if n >= self.doc.length: if n >= self.doc.length:
raise RuntimeError raise RuntimeError
return self.doc[root.l_edge:root.r_edge + 1] return self.doc[root.l_edge:root.r_edge + 1]
elif self.doc.is_sentenced:
# find start of the sentence
start = self.start
while self.doc.c[start].sent_start != 1 and start > 0:
start += -1
# find end of the sentence
end = self.end
n = 0
while end < self.doc.length and self.doc.c[end].sent_start != 1:
end += 1
n += 1
if n >= self.doc.length:
break
#
return self.doc[start:end]
else:
raise ValueError(
"Access to sentence requires either the dependency parse "
"or sentence boundaries to be set by setting " +
"doc[i].is_sent_start = True")
property has_vector: property has_vector:
"""RETURNS (bool): Whether a word vector is associated with the object. """RETURNS (bool): Whether a word vector is associated with the object.

View File

@ -34,11 +34,11 @@ cdef class Token:
@classmethod @classmethod
def get_extension(cls, name): def get_extension(cls, name):
return Underscore.token_extensions.get(name) return Underscore.span_extensions.get(name)
@classmethod @classmethod
def has_extension(cls, name): def has_extension(cls, name):
return name in Underscore.token_extensions return name in Underscore.span_extensions
def __cinit__(self, Vocab vocab, Doc doc, int offset): def __cinit__(self, Vocab vocab, Doc doc, int offset):
"""Construct a `Token` object. """Construct a `Token` object.

View File

@ -442,6 +442,29 @@ def decaying(start, stop, decay):
nr_upd += 1 nr_upd += 1
def minibatch_by_words(items, size, count_words=len):
'''Create minibatches of a given number of words.'''
if isinstance(size, int):
size_ = itertools.repeat(size)
else:
size_ = size
items = iter(items)
while True:
batch_size = next(size_)
batch = []
while batch_size >= 0:
try:
doc, gold = next(items)
except StopIteration:
if batch:
yield batch
return
batch_size -= count_words(doc)
batch.append((doc, gold))
if batch:
yield batch
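A hedged usage sketch of the helper (assuming it is importable from spacy.util once the hunk above lands): batches are cut as soon as the running word count passes size, so the last document in a batch may push it over.

from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.util import minibatch_by_words   # assumed import path

vocab = Vocab()
docs = [Doc(vocab, words=['word'] * n) for n in (3, 5, 2, 8)]
train_data = [(doc, None) for doc in docs]  # gold can be any per-doc annotation object

for batch in minibatch_by_words(train_data, size=6):
    print(sum(len(doc) for doc, _ in batch))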
def itershuffle(iterable, bufsize=1000): def itershuffle(iterable, bufsize=1000):
"""Shuffle an iterator. This works by holding `bufsize` items back """Shuffle an iterator. This works by holding `bufsize` items back
and yielding them sometime later. Obviously, this is not unbiased and yielding them sometime later. Obviously, this is not unbiased
@ -457,7 +480,7 @@ def itershuffle(iterable, bufsize=1000):
try: try:
while True: while True:
for i in range(random.randint(1, bufsize-len(buf))): for i in range(random.randint(1, bufsize-len(buf))):
buf.append(iterable.next()) buf.append(next(iterable))
random.shuffle(buf) random.shuffle(buf)
for i in range(random.randint(1, bufsize)): for i in range(random.randint(1, bufsize)):
if buf: if buf:
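The buffering idea in the docstring can be sketched independently of the helper above (hypothetical buffered_shuffle, not spaCy's implementation):

import random

def buffered_shuffle(iterable, bufsize=1000):
    # Hold up to `bufsize` items back and emit them in random order.
    buf = []
    for item in iterable:
        buf.append(item)
        if len(buf) >= bufsize:
            yield buf.pop(random.randrange(len(buf)))
    random.shuffle(buf)
    for item in buf:
        yield item

print(list(buffered_shuffle(range(10), bufsize=4)))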

View File

@ -120,9 +120,6 @@ include ../_includes/_mixins
| A Practical Real-World Approach to Gaining Actionable Insights | A Practical Real-World Approach to Gaining Actionable Insights
| from your Data | from your Data
+card("Practical Machine Learning with Python", "", "Dipanjan Sarkar et al. (Apress, 2017)", "book")
| A Problem-Solver's Guide to Building Real-World Intelligent Systems
+section("notebooks") +section("notebooks")
+h(2, "notebooks") Jupyter notebooks +h(2, "notebooks") Jupyter notebooks