Merge Span._ and Span.as_doc methods

Matthew Honnibal 2017-10-09 22:00:15 -05:00
commit e0a9b02b67
21 changed files with 333 additions and 97 deletions

View File

@ -21,7 +21,6 @@ import thinc.neural._classes.layernorm
thinc.neural._classes.layernorm.set_compat_six_eight(False)
def train_textcat(tokenizer, textcat,
train_texts, train_cats, dev_texts, dev_cats,
n_iter=20):
@ -57,13 +56,15 @@ def evaluate(tokenizer, textcat, texts, cats):
for i, doc in enumerate(textcat.pipe(docs)):
gold = cats[i]
for label, score in doc.cats.items():
if score >= 0.5 and label in gold:
if label not in gold:
continue
if score >= 0.5 and gold[label] >= 0.5:
tp += 1.
elif score >= 0.5 and label not in gold:
elif score >= 0.5 and gold[label] < 0.5:
fp += 1.
elif score < 0.5 and label not in gold:
elif score < 0.5 and gold[label] < 0.5:
tn += 1
if score < 0.5 and label in gold:
elif score < 0.5 and gold[label] >= 0.5:
fn += 1
precis = tp / (tp + fp)
recall = tp / (tp + fn)
@ -80,7 +81,7 @@ def load_data(limit=0):
train_data = train_data[-limit:]
texts, labels = zip(*train_data)
cats = [(['POSITIVE'] if y else []) for y in labels]
cats = [{'POSITIVE': bool(y)} for y in labels]
split = int(len(train_data) * 0.8)
@ -97,7 +98,7 @@ def main(model_loc=None):
textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE'])
print("Load IMDB data")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=1000)
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=2000)
print("Itn.\tLoss\tP\tR\tF")
progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}'
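
Note on the evaluation change above: with `cats` now a dict of label -> score, the loop can tell explicit negatives (value < 0.5) apart from missing annotations (key absent, which is simply skipped). A self-contained sketch of the same counting logic, not part of the commit, with made-up predictions and gold values:

def evaluate_cats(predicted, gold):
    # predicted, gold: parallel lists of dicts mapping label -> score in [0, 1]
    tp = fp = tn = fn = 0.0
    for scores, truth in zip(predicted, gold):
        for label, score in scores.items():
            if label not in truth:      # missing annotation: contributes nothing
                continue
            if score >= 0.5 and truth[label] >= 0.5:
                tp += 1.0
            elif score >= 0.5 and truth[label] < 0.5:
                fp += 1.0
            elif score < 0.5 and truth[label] < 0.5:
                tn += 1.0
            elif score < 0.5 and truth[label] >= 0.5:
                fn += 1.0
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f_score = (2 * precision * recall / (precision + recall)
               if precision + recall else 0.0)
    return precision, recall, f_score

print(evaluate_cats([{'POSITIVE': 0.9}, {'POSITIVE': 0.2}],
                    [{'POSITIVE': 1.0}, {'POSITIVE': 0.0}]))   # (1.0, 1.0, 1.0)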

View File

@ -264,7 +264,8 @@ def HistoryFeatures(nr_class, hist_size=8, nr_dim=8):
return layerize(noop())
embed_tables = [Embed(nr_dim, nr_class, column=i, name='embed%d')
for i in range(hist_size)]
embed = concatenate(*embed_tables)
embed = chain(concatenate(*embed_tables),
LN(Maxout(hist_size*nr_dim, hist_size*nr_dim)))
ops = embed.ops
def add_history_fwd(vectors_hists, drop=0.):
vectors, hist_ids = vectors_hists
@ -742,5 +743,3 @@ def concatenate_lists(*layers, **kwargs): # pragma: no cover
return ys, concatenate_lists_bwd
model = wrap(concatenate_lists_fwd, concat)
return model
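
For orientation, the `chain(concatenate(...), LN(Maxout(...)))` change above adds a mixing layer on top of the per-slot history embeddings. A conceptual numpy sketch of what that pipeline computes, under assumed shapes and with randomly initialised weights (this is illustrative only, not the thinc implementation):

import numpy as np

nr_class, nr_dim, hist_size, pieces = 20, 8, 8, 3
rng = np.random.RandomState(0)
tables = [rng.normal(size=(nr_class, nr_dim)) for _ in range(hist_size)]   # one embed table per history slot
W = rng.normal(size=(pieces, hist_size * nr_dim, hist_size * nr_dim))
b = np.zeros((pieces, hist_size * nr_dim))

def history_features(hist_ids):
    # hist_ids: (batch, hist_size) int array of previous action IDs
    embedded = np.concatenate(
        [tables[i][hist_ids[:, i]] for i in range(hist_size)], axis=1)
    # Maxout: take the elementwise max over `pieces` affine projections
    pieces_out = np.einsum('bi,pij->bpj', embedded, W) + b
    maxed = pieces_out.max(axis=1)
    # Layer normalisation
    mean = maxed.mean(axis=1, keepdims=True)
    var = maxed.var(axis=1, keepdims=True)
    return (maxed - mean) / np.sqrt(var + 1e-6)

print(history_features(rng.randint(0, nr_class, size=(4, hist_size))).shape)   # (4, 64)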

View File

@ -3,13 +3,13 @@
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
__title__ = 'spacy-nightly'
__version__ = '2.0.0a16'
__version__ = '2.0.0a17'
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
__uri__ = 'https://spacy.io'
__author__ = 'Explosion AI'
__email__ = 'contact@explosion.ai'
__license__ = 'MIT'
__release__ = True
__release__ = False
__docs_models__ = 'https://alpha.spacy.io/usage/models'
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'

View File

@ -4,7 +4,7 @@ from __future__ import unicode_literals
import plac
from pathlib import Path
from .converters import conllu2json, iob2json
from .converters import conllu2json, iob2json, conll_ner2json
from ..util import prints
# Converters are matched by file extension. To add a converter, add a new entry
@ -12,9 +12,10 @@ from ..util import prints
# from /converters.
CONVERTERS = {
'.conllu': conllu2json,
'.conll': conllu2json,
'.iob': iob2json,
'conllu': conllu2json,
'conll': conllu2json,
'ner': conll_ner2json,
'iob': iob2json,
}
@ -22,9 +23,11 @@ CONVERTERS = {
input_file=("input file", "positional", None, str),
output_dir=("output directory for converted file", "positional", None, str),
n_sents=("Number of sentences per doc", "option", "n", int),
converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
converter='auto'):
"""
Convert files into JSON format for use with train command and other
experiment management functions.
@ -35,9 +38,11 @@ def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
prints(input_path, title="Input file not found", exits=1)
if not output_path.exists():
prints(output_path, title="Output directory not found", exits=1)
file_ext = input_path.suffix
if not file_ext in CONVERTERS:
prints("Can't find converter for %s" % input_path.parts[-1],
if converter == 'auto':
converter = input_path.suffix[1:]
if not converter in CONVERTERS:
prints("Can't find converter for %s" % converter,
title="Unknown format", exits=1)
CONVERTERS[file_ext](input_path, output_path,
func = CONVERTERS[converter]
func(input_path, output_path,
n_sents=n_sents, use_morphology=morphology)
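
In short, `--converter` now overrides the extension-based lookup, with `'auto'` falling back to the file suffix. A standalone sketch of the dispatch (not part of the commit; the registry values here are placeholder strings rather than the real converter functions):

from pathlib import Path

# Stand-in registry; in the CLI the values are the actual converter functions.
CONVERTERS = {'conllu': 'conllu2json', 'conll': 'conllu2json',
              'ner': 'conll_ner2json', 'iob': 'iob2json'}

def pick_converter(input_file, converter='auto'):
    if converter == 'auto':
        converter = Path(input_file).suffix[1:]     # 'train.conllu' -> 'conllu'
    if converter not in CONVERTERS:
        raise ValueError("Can't find converter for %s" % converter)
    return CONVERTERS[converter]

print(pick_converter('data/train.conllu'))          # conllu2json
print(pick_converter('data/train.txt', 'ner'))      # conll_ner2json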

View File

@ -1,2 +1,3 @@
from .conllu2json import conllu2json
from .iob2json import iob2json
from .conll_ner2json import conll_ner2json

View File

@ -114,15 +114,33 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
nlp.to_disk(epoch_model_path)
nlp_loaded = lang_class(pipeline=pipeline)
nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
scorer = nlp_loaded.evaluate(
list(corpus.dev_docs(
dev_docs = list(corpus.dev_docs(
nlp_loaded,
gold_preproc=gold_preproc)))
gold_preproc=gold_preproc))
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
start_time = timer()
scorer = nlp_loaded.evaluate(dev_docs)
end_time = timer()
if use_gpu < 0:
gpu_wps = None
cpu_wps = nwords/(end_time-start_time)
else:
gpu_wps = nwords/(end_time-start_time)
with Model.use_device('cpu'):
nlp_loaded = lang_class(pipeline=pipeline)
nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
dev_docs = list(corpus.dev_docs(
nlp_loaded, gold_preproc=gold_preproc))
start_time = timer()
scorer = nlp_loaded.evaluate(dev_docs)
end_time = timer()
cpu_wps = nwords/(end_time-start_time)
acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
with acc_loc.open('w') as file_:
file_.write(json_dumps(scorer.scores))
meta_loc = output_path / ('model%d' % i) / 'meta.json'
meta['accuracy'] = scorer.scores
meta['speed'] = {'nwords': nwords, 'cpu':cpu_wps, 'gpu': gpu_wps}
meta['lang'] = nlp.lang
meta['pipeline'] = pipeline
meta['spacy_version'] = '>=%s' % about.__version__
@ -132,7 +150,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
with meta_loc.open('w') as file_:
file_.write(json_dumps(meta))
util.set_env_log(True)
print_progress(i, losses, scorer.scores)
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps)
finally:
print("Saving model...")
try:
@ -153,16 +171,17 @@ def _render_parses(i, to_render):
file_.write(html)
def print_progress(itn, losses, dev_scores, wps=0.0):
def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
scores = {}
for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
'ents_p', 'ents_r', 'ents_f', 'wps']:
'ents_p', 'ents_r', 'ents_f', 'cpu_wps', 'gpu_wps']:
scores[col] = 0.0
scores['dep_loss'] = losses.get('parser', 0.0)
scores['ner_loss'] = losses.get('ner', 0.0)
scores['tag_loss'] = losses.get('tagger', 0.0)
scores.update(dev_scores)
scores['wps'] = wps
scores['cpu_wps'] = cpu_wps
scores['gpu_wps'] = gpu_wps or 0.0
tpl = '\t'.join((
'{:d}',
'{dep_loss:.3f}',
@ -173,7 +192,9 @@ def print_progress(itn, losses, dev_scores, wps=0.0):
'{ents_f:.3f}',
'{tags_acc:.3f}',
'{token_acc:.3f}',
'{wps:.1f}'))
'{cpu_wps:.1f}',
'{gpu_wps:.1f}',
))
print(tpl.format(itn, **scores))
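
The timing changes above report separate CPU and GPU words-per-second figures for the evaluation pass. A simplified, standalone sketch of the measurement; `evaluate` and the document format are placeholders, and the timer import is an assumption rather than a quote from the commit:

from timeit import default_timer as timer

def measure_wps(evaluate, dev_docs):
    # dev_docs: list of (doc, gold) pairs; wps = tokens evaluated per second
    nwords = sum(len(doc) for doc, gold in dev_docs)
    start_time = timer()
    scorer = evaluate(dev_docs)
    end_time = timer()
    return scorer, nwords / (end_time - start_time)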

View File

@ -387,7 +387,7 @@ cdef class GoldParse:
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
deps=None, entities=None, make_projective=False,
cats=tuple()):
cats=None):
"""Create a GoldParse.
doc (Doc): The document the annotations refer to.
@ -398,12 +398,15 @@ cdef class GoldParse:
entities (iterable): A sequence of named entity annotations, either as
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
representing the entity positions.
cats (iterable): A sequence of labels for text classification. Each
label may be a string or an int, or a `(start_char, end_char, label)`
cats (dict): Labels for text classification. Each key in the dictionary
may be a string or an int, or a `(start_char, end_char, label)`
tuple, indicating that the label is applied to only part of the
document (usually a sentence). Unlike entity annotations, label
annotations can overlap, i.e. a single word can be covered by
multiple labelled spans.
multiple labelled spans. The TextCategorizer component expects
true examples of a label to have the value 1.0, and negative examples
of a label to have the value 0.0. Labels not in the dictionary are
treated as missing -- the gradient for those labels will be zero.
RETURNS (GoldParse): The newly constructed object.
"""
if words is None:
@ -434,7 +437,7 @@ cdef class GoldParse:
self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
self.cats = list(cats)
self.cats = {} if cats is None else dict(cats)
self.words = [None] * len(doc)
self.tags = [None] * len(doc)
self.heads = [None] * len(doc)
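
Usage-wise, `cats` is now a dict: 1.0 marks a positive example of a label, 0.0 an explicit negative, and labels left out of the dict are treated as missing and receive zero gradient. A small sketch, not part of the commit:

from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.gold import GoldParse

doc = Doc(Vocab(), words=['This', 'movie', 'was', 'great'])
# POSITIVE is explicitly true, NEGATIVE explicitly false; SARCASM is simply missing
gold = GoldParse(doc, cats={'POSITIVE': 1.0, 'NEGATIVE': 0.0})
assert gold.cats == {'POSITIVE': 1.0, 'NEGATIVE': 0.0}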

View File

@ -126,7 +126,7 @@ def word_shape(text):
LEX_ATTRS = {
attrs.LOWER: lambda string: string.lower(),
attrs.NORM: lambda string: string.lower(),
attrs.PREFIX: lambda string: string[0],
attrs.PREFIX: lambda string: string[:3],
attrs.SUFFIX: lambda string: string[-3:],
attrs.CLUSTER: lambda string: 0,
attrs.IS_ALPHA: lambda string: string.isalpha(),
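
The PREFIX change makes the prefix feature three characters long, mirroring SUFFIX. For example:

PREFIX = lambda string: string[:3]
SUFFIX = lambda string: string[-3:]
print(PREFIX('spacy'), SUFFIX('spacy'))   # spa acy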

View File

@ -158,11 +158,13 @@ class BaseThincComponent(object):
def to_bytes(self, **exclude):
"""Serialize the pipe to a bytestring."""
serialize = OrderedDict((
('cfg', lambda: json_dumps(self.cfg)),
('model', lambda: self.model.to_bytes()),
('vocab', lambda: self.vocab.to_bytes())
))
serialize = OrderedDict()
serialize['cfg'] = lambda: json_dumps(self.cfg)
if self.model in (True, False, None):
serialize['model'] = lambda: self.model
else:
serialize['model'] = self.model.to_bytes
serialize['vocab'] = self.vocab.to_bytes
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude):
@ -183,11 +185,11 @@ class BaseThincComponent(object):
def to_disk(self, path, **exclude):
"""Serialize the pipe to disk."""
serialize = OrderedDict((
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
('vocab', lambda p: self.vocab.to_disk(p)),
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
))
serialize = OrderedDict()
serialize['cfg'] = lambda p: p.open('w').write(json_dumps(self.cfg))
serialize['vocab'] = lambda p: self.vocab.to_disk(p)
if self.model not in (None, True, False):
serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes())
util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude):
@ -438,13 +440,16 @@ class NeuralTagger(BaseThincComponent):
yield
def to_bytes(self, **exclude):
serialize = OrderedDict((
('model', lambda: self.model.to_bytes()),
('vocab', lambda: self.vocab.to_bytes()),
('tag_map', lambda: msgpack.dumps(self.vocab.morphology.tag_map,
serialize = OrderedDict()
if self.model in (None, True, False):
serialize['model'] = lambda: self.model
else:
serialize['model'] = self.model.to_bytes
serialize['vocab'] = self.vocab.to_bytes
serialize['tag_map'] = lambda: msgpack.dumps(self.vocab.morphology.tag_map,
use_bin_type=True,
encoding='utf8'))
))
encoding='utf8')
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude):
@ -552,7 +557,6 @@ class NeuralLabeller(NeuralTagger):
label = self.make_label(i, words, tags, heads, deps, ents)
if label is not None and label not in self.labels:
self.labels[label] = len(self.labels)
print(len(self.labels))
if self.model is True:
token_vector_width = util.env_opt('token_vector_width')
self.model = chain(
@ -721,11 +725,17 @@ class TextCategorizer(BaseThincComponent):
def get_loss(self, docs, golds, scores):
truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f')
for i, gold in enumerate(golds):
for j, label in enumerate(self.labels):
truths[i, j] = label in gold.cats
if label in gold.cats:
truths[i, j] = gold.cats[label]
else:
not_missing[i, j] = 0.
truths = self.model.ops.asarray(truths)
not_missing = self.model.ops.asarray(not_missing)
d_scores = (scores-truths) / scores.shape[0]
d_scores *= not_missing
mean_square_error = ((scores-truths)**2).sum(axis=1).mean()
return mean_square_error, d_scores
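
The `not_missing` mask above means unannotated labels contribute neither loss nor gradient. A numpy sketch of the same computation outside the class, with a made-up label set and scores:

import numpy as np

labels = ['POSITIVE', 'NEGATIVE']
golds = [{'POSITIVE': 1.0}, {'POSITIVE': 0.0, 'NEGATIVE': 1.0}]     # doc 0 has no NEGATIVE annotation
scores = np.asarray([[0.8, 0.3], [0.4, 0.9]], dtype='f')            # model outputs

truths = np.zeros((len(golds), len(labels)), dtype='f')
not_missing = np.ones((len(golds), len(labels)), dtype='f')
for i, gold in enumerate(golds):
    for j, label in enumerate(labels):
        if label in gold:
            truths[i, j] = gold[label]
        else:
            not_missing[i, j] = 0.0                 # missing label: no gradient

d_scores = (scores - truths) / scores.shape[0]      # gradient of the squared error
d_scores *= not_missing                             # zero out missing labels
mean_square_error = ((scores - truths) ** 2).sum(axis=1).mean()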

View File

@ -61,13 +61,13 @@ cdef struct TokenC:
attr_t sense
int head
attr_t dep
bint sent_start
uint32_t l_kids
uint32_t r_kids
uint32_t l_edge
uint32_t r_edge
int sent_start
int ent_iob
attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
hash_t ent_id

View File

@ -307,6 +307,8 @@ cdef cppclass StateC:
this._stack[this._s_i] = this.B(0)
this._s_i += 1
this._b_i += 1
if this.B_(0).sent_start == 1:
this.set_break(this.B(0))
if this._b_i > this._break:
this._break = -1
@ -383,7 +385,7 @@ cdef cppclass StateC:
void set_break(int i) nogil:
if 0 <= i < this.length:
this._sent[i].sent_start = True
this._sent[i].sent_start = 1
this._break = this._b_i
void clone(const StateC* src) nogil:

View File

@ -118,7 +118,7 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
cdef class Shift:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and st.B_(0).sent_start != 1
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
@ -178,7 +178,7 @@ cdef class Reduce:
cdef class LeftArc:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return not st.B_(0).sent_start
return st.B_(0).sent_start != 1
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
@ -212,7 +212,7 @@ cdef class LeftArc:
cdef class RightArc:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return not st.B_(0).sent_start
return st.B_(0).sent_start != 1
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
@ -248,6 +248,10 @@ cdef class Break:
return False
elif st.stack_depth() < 1:
return False
elif st.B_(0).l_edge < 0:
return False
elif st._sent[st.B_(0).l_edge].sent_start < 0:
return False
else:
return True

View File

@ -219,30 +219,28 @@ cdef class BiluoPushDown(TransitionSystem):
raise Exception(move)
return t
#def add_action(self, int action, label_name):
# cdef attr_t label_id
# if not isinstance(label_name, (int, long)):
# label_id = self.strings.add(label_name)
# else:
# label_id = label_name
# if action == OUT and label_id != 0:
# return
# if action == MISSING or action == ISNT:
# return
# # Check we're not creating a move we already have, so that this is
# # idempotent
# for trans in self.c[:self.n_moves]:
# if trans.move == action and trans.label == label_id:
# return 0
# if self.n_moves >= self._size:
# self._size *= 2
# self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
# self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
# assert self.c[self.n_moves].label == label_id
# self.n_moves += 1
# return 1
def add_action(self, int action, label_name):
cdef attr_t label_id
if not isinstance(label_name, (int, long)):
label_id = self.strings.add(label_name)
else:
label_id = label_name
if action == OUT and label_id != 0:
return
if action == MISSING or action == ISNT:
return
# Check we're not creating a move we already have, so that this is
# idempotent
for trans in self.c[:self.n_moves]:
if trans.move == action and trans.label == label_id:
return 0
if self.n_moves >= self._size:
self._size *= 2
self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
assert self.c[self.n_moves].label == label_id
self.n_moves += 1
return 1
cdef int initialize_state(self, StateC* st) nogil:
# This is especially necessary when we use limited training data.

View File

@ -51,7 +51,7 @@ from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
from .._ml import Residual, drop_layer, flatten
from .._ml import link_vectors_to_models
from .._ml import HistoryFeatures
from ..compat import json_dumps
from ..compat import json_dumps, copy_array
from . import _parse_features
from ._parse_features cimport CONTEXT_SIZE
@ -239,13 +239,13 @@ cdef class Parser:
"""
@classmethod
def Model(cls, nr_class, **cfg):
depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 2))
depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 0))
token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128))
hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 128))
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 1))
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 3))
embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
hist_size = util.env_opt('history_feats', cfg.get('hist_size', 4))
hist_width = util.env_opt('history_width', cfg.get('hist_width', 16))
hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
if hist_size >= 1 and depth == 0:
raise ValueError("Inconsistent hyper-params: "
"history_feats >= 1 but parser_hidden_depth==0")
@ -789,12 +789,22 @@ cdef class Parser:
return []
def add_label(self, label):
resized = False
for action in self.moves.action_types:
added = self.moves.add_action(action, label)
if added:
# Important that the labels be stored as a list! We need the
# order, or the model goes out of synch
self.cfg.setdefault('extra_labels', []).append(label)
resized = True
if self.model not in (True, False, None) and resized:
# Weights are stored in (nr_out, nr_in) format, so we're basically
# just adding rows here.
smaller = self.model[-1]._layers[-1]
larger = Affine(self.moves.n_moves, smaller.nI)
copy_array(larger.W[:smaller.nO], smaller.W)
copy_array(larger.b[:smaller.nO], smaller.b)
self.model[-1]._layers[-1] = larger
def begin_training(self, gold_tuples, pipeline=None, **cfg):
if 'model' in cfg:
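
The `add_label` change resizes the model's final layer when new moves are added after the weights already exist: the old rows are copied into a larger matrix, so trained classes keep their parameters and the new rows start at zero. A numpy sketch of that resize, not tied to thinc, using the (nr_out, nr_in) layout mentioned in the comment:

import numpy as np

def resize_output_layer(W, b, new_nr_out):
    # W: (nr_out, nr_in), b: (nr_out,) -- parameters of the existing output layer
    nr_out, nr_in = W.shape
    W_new = np.zeros((new_nr_out, nr_in), dtype=W.dtype)
    b_new = np.zeros((new_nr_out,), dtype=b.dtype)
    W_new[:nr_out] = W          # trained classes keep their weights
    b_new[:nr_out] = b          # rows for the new classes stay zero
    return W_new, b_new

W, b = np.random.randn(5, 16), np.zeros(5)
W2, b2 = resize_output_layer(W, b, 7)               # e.g. two new actions were added
assert (W2[:5] == W).all() and not W2[5:].any()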

View File

@ -0,0 +1,68 @@
'''Test the ability to add a label to a (potentially trained) parsing model.'''
from __future__ import unicode_literals
import pytest
import numpy.random
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps
from ...attrs import NORM
from ...gold import GoldParse
from ...vocab import Vocab
from ...tokens import Doc
from ...pipeline import NeuralDependencyParser
numpy.random.seed(0)
@pytest.fixture
def vocab():
return Vocab(lex_attr_getters={NORM: lambda s: s})
@pytest.fixture
def parser(vocab):
parser = NeuralDependencyParser(vocab)
parser.cfg['token_vector_width'] = 4
parser.cfg['hidden_width'] = 6
parser.cfg['hist_size'] = 0
parser.add_label('left')
parser.begin_training([], **parser.cfg)
sgd = Adam(NumpyOps(), 0.001)
for i in range(30):
losses = {}
doc = Doc(vocab, words=['a', 'b', 'c', 'd'])
gold = GoldParse(doc, heads=[1, 1, 3, 3],
deps=['left', 'ROOT', 'left', 'ROOT'])
parser.update([doc], [gold], sgd=sgd, losses=losses)
return parser
def test_add_label(parser):
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc = parser(doc)
assert doc[0].head.i == 1
assert doc[0].dep_ == 'left'
assert doc[1].head.i == 1
assert doc[2].head.i == 3
assert doc[2].head.i == 3
parser.add_label('right')
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc = parser(doc)
assert doc[0].head.i == 1
assert doc[0].dep_ == 'left'
assert doc[1].head.i == 1
assert doc[2].head.i == 3
assert doc[2].head.i == 3
sgd = Adam(NumpyOps(), 0.001)
for i in range(10):
losses = {}
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
gold = GoldParse(doc, heads=[1, 1, 3, 3],
deps=['right', 'ROOT', 'left', 'ROOT'])
parser.update([doc], [gold], sgd=sgd, losses=losses)
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc = parser(doc)
assert doc[0].dep_ == 'right'
assert doc[2].dep_ == 'left'

View File

@ -35,7 +35,8 @@ def parser(vocab, arc_eager):
@pytest.fixture
def model(arc_eager, tok2vec):
return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO)[0]
return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO,
hist_size=0)[0]
@pytest.fixture
def doc(vocab):
@ -51,7 +52,7 @@ def test_can_init_nn_parser(parser):
def test_build_model(parser):
parser.model = Parser.Model(parser.moves.n_moves)[0]
parser.model = Parser.Model(parser.moves.n_moves, hist_size=0)[0]
assert parser.model is not None

View File

@ -0,0 +1,73 @@
'''Test that the parser respects preset sentence boundaries.'''
from __future__ import unicode_literals
import pytest
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps
from ...attrs import NORM
from ...gold import GoldParse
from ...vocab import Vocab
from ...tokens import Doc
from ...pipeline import NeuralDependencyParser
@pytest.fixture
def vocab():
return Vocab(lex_attr_getters={NORM: lambda s: s})
@pytest.fixture
def parser(vocab):
parser = NeuralDependencyParser(vocab)
parser.cfg['token_vector_width'] = 4
parser.cfg['hidden_width'] = 32
#parser.add_label('right')
parser.add_label('left')
parser.begin_training([], **parser.cfg)
sgd = Adam(NumpyOps(), 0.001)
for i in range(10):
losses = {}
doc = Doc(vocab, words=['a', 'b', 'c', 'd'])
gold = GoldParse(doc, heads=[1, 1, 3, 3],
deps=['left', 'ROOT', 'left', 'ROOT'])
parser.update([doc], [gold], sgd=sgd, losses=losses)
return parser
def test_no_sentences(parser):
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc = parser(doc)
assert len(list(doc.sents)) == 2
def test_sents_1(parser):
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc[2].sent_start = True
doc = parser(doc)
assert len(list(doc.sents)) >= 2
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc[1].sent_start = False
doc[2].sent_start = True
doc[3].sent_start = False
doc = parser(doc)
assert len(list(doc.sents)) == 2
def test_sents_1_2(parser):
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc[1].sent_start = True
doc[2].sent_start = True
doc = parser(doc)
assert len(list(doc.sents)) == 3
def test_sents_1_3(parser):
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc[1].sent_start = True
doc[3].sent_start = True
doc = parser(doc)
assert len(list(doc.sents)) == 4
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc[1].sent_start = True
doc[2].sent_start = False
doc[3].sent_start = True
doc = parser(doc)
assert len(list(doc.sents)) == 3

View File

@ -0,0 +1,9 @@
import spacy
import spacy.lang.en
from spacy.pipeline import TextCategorizer
def test_bytes_serialize_issue_1105():
nlp = spacy.lang.en.English()
tokenizer = nlp.tokenizer
textcat = TextCategorizer(tokenizer.vocab, labels=['ENTITY', 'ACTION', 'MODIFIER'])
textcat_bytes = textcat.to_bytes()

View File

@ -506,7 +506,7 @@ cdef class Doc:
cdef int i
start = 0
for i in range(1, self.length):
if self.c[i].sent_start:
if self.c[i].sent_start == 1:
yield Span(self, start, i)
start = i
if start != self.length:

View File

@ -129,6 +129,29 @@ cdef class Span:
def _(self):
return Underscore(Underscore.span_extensions, self,
start=self.start_char, end=self.end_char)
def as_doc(self):
'''Create a Doc object view of the Span's data.
This is mostly useful for C-typed interfaces.
'''
cdef Doc doc = Doc(self.doc.vocab)
doc.length = self.end-self.start
doc.c = &self.doc.c[self.start]
doc.mem = self.doc.mem
doc.is_parsed = self.doc.is_parsed
doc.is_tagged = self.doc.is_tagged
doc.noun_chunks_iterator = self.doc.noun_chunks_iterator
doc.user_hooks = self.doc.user_hooks
doc.user_span_hooks = self.doc.user_span_hooks
doc.user_token_hooks = self.doc.user_token_hooks
doc.vector = self.vector
doc.vector_norm = self.vector_norm
for key, value in self.doc.cats.items():
if hasattr(key, '__len__') and len(key) == 3:
cat_start, cat_end, cat_label = key
if cat_start == self.start_char and cat_end == self.end_char:
doc.cats[cat_label] = value
return doc
def merge(self, *args, **attributes):
"""Retokenize the document, such that the span is merged into a single

View File

@ -300,13 +300,21 @@ cdef class Token:
def __get__(self):
return self.c.sent_start
def __set__(self, bint value):
def __set__(self, value):
if self.doc.is_parsed:
raise ValueError(
'Refusing to write to token.sent_start if its document is parsed, '
'because this may cause inconsistent state. '
'See https://github.com/spacy-io/spaCy/issues/235 for workarounds.')
self.c.sent_start = value
if value is None:
self.c.sent_start = 0
elif value is True:
self.c.sent_start = 1
elif value is False:
self.c.sent_start = -1
else:
raise ValueError("Invalid value for token.sent_start -- must be one of "
"None, True, False")
property lefts:
def __get__(self):
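
`Token.sent_start` is now ternary rather than boolean: `None` stores 0 (unknown, left to the parser), `True` stores 1 (force a sentence boundary), and `False` stores -1 (forbid one), which is why the transition checks above compare against 1 explicitly. A short usage sketch mirroring the preset-boundary tests:

from spacy.vocab import Vocab
from spacy.tokens import Doc

doc = Doc(Vocab(), words=['a', 'b', 'c', 'd'])
doc[2].sent_start = True       # stored as 1: a sentence must start here
doc[1].sent_start = False      # stored as -1: a sentence may not start here
doc[3].sent_start = None       # stored as 0: left for the parser to decide
# Any other value raises ValueError, and writing is refused once doc.is_parsed is set.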