spaCy/spacy/pipeline/pipes.pyx

# cython: infer_types=True
# cython: profile=True
# coding: utf8
from __future__ import unicode_literals

import numpy
import srsly
import random
from collections import OrderedDict
from thinc.api import chain
from thinc.v2v import Affine, Maxout, Softmax
from thinc.misc import LayerNorm
from thinc.neural.util import to_categorical
from thinc.neural.util import get_array_module

from .functions import merge_subtokens
from ..tokens.doc cimport Doc
from ..syntax.nn_parser cimport Parser
from ..syntax.ner cimport BiluoPushDown
from ..syntax.arc_eager cimport ArcEager
from ..morphology cimport Morphology
from ..vocab cimport Vocab

from ..syntax import nonproj
from ..attrs import POS, ID
from ..parts_of_speech import X
from ..kb import KnowledgeBase
from .._ml import Tok2Vec, build_tagger_model, cosine, get_cossim_loss
from .._ml import build_text_classifier, build_simple_cnn_text_classifier
from .._ml import build_bow_text_classifier, build_nel_encoder
from .._ml import link_vectors_to_models, zero_init, flatten
from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss
from .._ml import MultiSoftmax, get_characters_loss
from ..errors import Errors, TempErrors, user_warning, Warnings
from .. import util


def _load_cfg(path):
    """Read a JSON config file if it is present.

    path: Object exposing `exists()`; passed to `srsly.read_json` when the
        file exists.
    RETURNS: The value returned by `srsly.read_json(path)`, or an empty dict
        when `path.exists()` is false.
    """
    if not path.exists():
        return {}
    return srsly.read_json(path)


class Pipe(object):
    """This class is not instantiated directly. Components inherit from it, and
    it defines the interface that components should follow to function as
    components in a spaCy analysis pipeline.
    """

    # Subclasses set this to the component's string name. It is used in
    # error messages (see `require_model`).
    name = None

    @classmethod
    def Model(cls, *shape, **kwargs):
        """Initialize a model for the pipe. Must be overridden by subclasses."""
        raise NotImplementedError

    def __init__(self, vocab, model=True, **cfg):
        """Create a new pipe instance. Must be overridden by subclasses."""
        raise NotImplementedError

    def __call__(self, doc):
        """Apply the pipe to one document. The document is
        modified in-place, and returned.

        Both __call__ and pipe should delegate to the `predict()`
        and `set_annotations()` methods.

        doc: The document to annotate.
        RETURNS: The same `doc` object that was passed in.
        """
        self.require_model()
        predictions = self.predict([doc])
        # A 2-tuple from predict() is interpreted as (scores, tensors).
        if isinstance(predictions, tuple) and len(predictions) == 2:
            scores, tensors = predictions
            self.set_annotations([doc], scores, tensors=tensors)
        else:
            self.set_annotations([doc], predictions)
        return doc

    def require_model(self):
        """Raise an error if the component's model is not initialized.

        A missing `model` attribute, or one that is None, True or False,
        counts as "not initialized".

        RAISES (ValueError): With message `Errors.E109` formatted with the
            component name.
        """
        if getattr(self, "model", None) in (None, True, False):
            raise ValueError(Errors.E109.format(name=self.name))

    def pipe(self, stream, batch_size=128, n_threads=-1):
        """Apply the pipe to a stream of documents.

        Both __call__ and pipe should delegate to the `predict()`
        and `set_annotations()` methods.

        stream (iterable): Documents to process.
        batch_size (int): Size passed to `util.minibatch`.
        n_threads (int): Not used by this implementation.
        YIELDS: The documents of each batch, after annotation.
        """
        for docs in util.minibatch(stream, size=batch_size):
            docs = list(docs)
            predictions = self.predict(docs)
            # Same (scores, tensors) convention as __call__. The length check
            # must be made on the returned value, not on the `tuple` type.
            if isinstance(predictions, tuple) and len(predictions) == 2:
                scores, tensors = predictions
                self.set_annotations(docs, scores, tensors=tensors)
            else:
                self.set_annotations(docs, predictions)
            yield from docs

    def predict(self, docs):
        """Apply the pipeline's model to a batch of docs, without
        modifying them.

        The base implementation only checks that a model is present and then
        raises NotImplementedError.
        """
        self.require_model()
        raise NotImplementedError

    def set_annotations(self, docs, scores, tensors=None):
        """Modify a batch of documents, using pre-computed scores."""
        raise NotImplementedError

    def update(self, docs, golds, drop=0.0, sgd=None, losses=None):
        """Learn from a batch of documents and gold-standard information,
        updating the pipe's model.

        Delegates to predict() and get_loss().

        The base implementation is a no-op.
        """
        pass

    def rehearse(self, docs, sgd=None, losses=None, **config):
        """Hook for rehearsal updates. The base implementation is a no-op."""
        pass

    def get_loss(self, docs, golds, scores):
        """Find the loss and gradient of loss for the batch of
        documents and their predicted scores."""
        raise NotImplementedError

    def add_label(self, label):
        """Add an output label, to be predicted by the model.

        It's possible to extend pretrained models with new labels,
        but care should be taken to avoid the "catastrophic forgetting"
        problem.
        """
        raise NotImplementedError

    def create_optimizer(self):
        """Create an optimizer via `create_default_optimizer`, using the
        model's ops and the optional "optimizer" entry of `self.cfg` as
        keyword arguments.
        """
        return create_default_optimizer(self.model.ops, **self.cfg.get("optimizer", {}))

    def begin_training(
        self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs
    ):
        """Initialize the pipe for training, using data examples if available.
        If no model has been initialized yet, the model is added.

        get_gold_tuples (callable): Not used by the base implementation.
        pipeline: Not used by the base implementation.
        sgd: An optimizer; one is created with `create_optimizer()` if None.
        RETURNS: The optimizer.
        """
        if self.model is True:
            self.model = self.Model(**self.cfg)
        if hasattr(self, "vocab"):
            link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd

    def use_params(self, params):
        """Modify the pipe's model, to use the given parameter values.

        This is a generator that yields once while `self.model.use_params`
        is active.
        """
        with self.model.use_params(params):
            yield

    def to_bytes(self, exclude=tuple(), **kwargs):
        """Serialize the pipe to a bytestring.

        exclude (list): String names of serialization fields to exclude.
        RETURNS (bytes): The serialized object.
        """
        serialize = OrderedDict()
        serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
        # Only serialize a model that has actually been allocated.
        if self.model not in (True, False, None):
            serialize["model"] = self.model.to_bytes
        if hasattr(self, "vocab"):
            serialize["vocab"] = self.vocab.to_bytes
        exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
        return util.to_bytes(serialize, exclude)

    def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
        """Load the pipe from a bytestring.

        bytes_data (bytes): Data produced by `to_bytes`.
        exclude (list): String names of serialization fields to exclude.
        RETURNS: `self`.
        RAISES (ValueError): `Errors.E149` if loading the model bytes raises
            an AttributeError.
        """

        def load_model(b):
            # TODO: Remove this once we don't have to handle previous models
            if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
                self.cfg["pretrained_vectors"] = self.vocab.vectors.name
            if self.model is True:
                self.model = self.Model(**self.cfg)
            try:
                self.model.from_bytes(b)
            except AttributeError:
                raise ValueError(Errors.E149)

        # Insertion order matters: cfg is registered before the model so that
        # load_model() can build the model from the loaded config.
        deserialize = OrderedDict()
        deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
        if hasattr(self, "vocab"):
            deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
        deserialize["model"] = load_model
        exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
        util.from_bytes(bytes_data, deserialize, exclude)
        return self

    def to_disk(self, path, exclude=tuple(), **kwargs):
        """Serialize the pipe to disk.

        path: Location passed on to `util.to_disk`.
        exclude (list): String names of serialization fields to exclude.
        """

        def save_model(p):
            # Close the handle deterministically so the bytes are flushed
            # before this method returns.
            with p.open("wb") as file_:
                file_.write(self.model.to_bytes())

        serialize = OrderedDict()
        serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
        if self.model not in (None, True, False):
            serialize["model"] = save_model
        exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
        util.to_disk(path, serialize, exclude)

    def from_disk(self, path, exclude=tuple(), **kwargs):
        """Load the pipe from disk.

        path: Location passed on to `util.from_disk`.
        exclude (list): String names of serialization fields to exclude.
        RETURNS: `self`.
        RAISES (ValueError): `Errors.E149` if loading the model bytes raises
            an AttributeError.
        """

        def load_model(p):
            # TODO: Remove this once we don't have to handle previous models
            if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
                self.cfg["pretrained_vectors"] = self.vocab.vectors.name
            if self.model is True:
                self.model = self.Model(**self.cfg)
            # Read through a context manager so the file handle is closed
            # even if deserialization fails.
            with p.open("rb") as file_:
                try:
                    self.model.from_bytes(file_.read())
                except AttributeError:
                    raise ValueError(Errors.E149)

        deserialize = OrderedDict()
        deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
        deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
        deserialize["model"] = load_model
        exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
        util.from_disk(path, deserialize, exclude)
        return self


class Tensorizer(Pipe):
    """Pre-train position-sensitive vectors for tokens."""

    name = "tensorizer"

    @classmethod
    def Model(cls, output_size=300, **cfg):
        """Build the model for this component.

        The result is a zero-initialised `Affine` layer with dropout factor
        0.0. Its input width is resolved with
        `util.env_opt("token_vector_width", ...)`, falling back to
        `cfg["input_size"]` and then to 96.

        output_size (int): Output width of the affine layer.
        **cfg: Config parameters; only "input_size" is read here.
        RETURNS: The value returned by `zero_init(Affine(...))`.
        """
        fallback_width = cfg.get("input_size", 96)
        input_size = util.env_opt("token_vector_width", fallback_width)
        affine = Affine(output_size, input_size, drop_factor=0.0)
        return zero_init(affine)

    def __init__(self, vocab, model=True, **cfg):
        """Set up the component. No weights are allocated here.

        vocab (Vocab): Stored as `self.vocab`.
        model: A model instance, or `True` to have one allocated later
            (see `begin_training`).
        **cfg: Config parameters. "cnn_maxout_pieces" defaults to 3.
        """
        settings = dict(cfg)
        settings.setdefault("cnn_maxout_pieces", 3)
        self.cfg = settings
        self.input_models = []
        self.model = model
        self.vocab = vocab

    def __call__(self, doc):
        """Compute a tensor for a single `Doc` and store it on `doc.tensor`.

        doc (Doc): The document to process.
        RETURNS (Doc): The same document.
        """
        self.set_annotations([doc], self.predict([doc]))
        return doc

    def pipe(self, stream, batch_size=128, n_threads=-1):
        """Process `Doc` objects as a stream.

        stream (iterator): A sequence of `Doc` objects to process.
        batch_size (int): Number of `Doc` objects to group.
        n_threads (int): Not used by this implementation.
        YIELDS (Doc): The documents of each batch, after their tensors
            have been set.
        """
        for batch in util.minibatch(stream, size=batch_size):
            batch = list(batch)
            self.set_annotations(batch, self.predict(batch))
            yield from batch

    def predict(self, docs):
        """Run the model over the concatenated `doc.tensor` arrays of a batch.

        docs (iterable): A sequence of `Doc` objects.
        RETURNS (object): The model output, unflattened using the length of
            each doc.
        """
        self.require_model()
        flat_inputs = self.model.ops.flatten([doc.tensor for doc in docs])
        flat_outputs = self.model(flat_inputs)
        lengths = [len(d) for d in docs]
        return self.model.ops.unflatten(flat_outputs, lengths)

    def set_annotations(self, docs, tensors):
        """Assign each tensor to the matching document's `tensor` attribute.

        docs (iterable): A sequence of `Doc` objects.
        tensors (object): One array per doc; the first dimension must equal
            the doc's length.
        RAISES (ValueError): `Errors.E076` when the row count of a tensor
            differs from the length of its doc.
        """
        for doc, tensor in zip(docs, tensors):
            n_rows = tensor.shape[0]
            if n_rows != len(doc):
                raise ValueError(Errors.E076.format(rows=n_rows, words=len(doc)))
            doc.tensor = tensor

    def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None):
        """Update the model.

        docs (iterable): A batch of `Doc` objects (a single `Doc` is wrapped
            in a list).
        golds (iterable): Passed through to `get_loss`.
        state: Not used by this implementation.
        drop (float): The dropout rate.
        sgd (callable): An optimizer.
        losses (dict): Optional; the loss is accumulated under `self.name`.
        RETURNS: The loss returned by `get_loss`.
        """
        self.require_model()
        if isinstance(docs, Doc):
            docs = [docs]
        # Forward pass through every registered upstream tok2vec model,
        # keeping the backprop callbacks for later.
        upstream = [
            tok2vec.begin_update(docs, drop=drop) for tok2vec in self.input_models
        ]
        inputs = [tensor for tensor, _ in upstream]
        bp_inputs = [bp_tensor for _, bp_tensor in upstream]
        inputs = self.model.ops.xp.hstack(inputs)
        scores, bp_scores = self.model.begin_update(inputs, drop=drop)
        loss, d_scores = self.get_loss(docs, golds, scores)
        d_inputs = bp_scores(d_scores, sgd=sgd)
        # Split the input gradient back into one slice per upstream model.
        n_models = len(self.input_models)
        d_inputs = self.model.ops.xp.split(d_inputs, n_models, axis=1)
        for d_input, bp_input in zip(d_inputs, bp_inputs):
            bp_input(d_input, sgd=sgd)
        if losses is not None:
            losses.setdefault(self.name, 0.0)
            losses[self.name] += loss
        return loss

    def get_loss(self, docs, golds, prediction):
        """Compute the loss and gradient against the vocab's vectors table.

        Target rows are looked up in `self.vocab.vectors.data` using the `ID`
        attribute of every token.

        docs (iterable): A batch of `Doc` objects.
        golds: Not used by this implementation.
        prediction: The model output for the batch.
        RETURNS (tuple): `(loss, d_scores)`, where `d_scores` is
            `(prediction - target) / prediction.shape[0]` and `loss` is the
            sum of its squares.
        """
        token_ids = [doc.to_array(ID).ravel() for doc in docs]
        ids = self.model.ops.flatten(token_ids)
        target = self.vocab.vectors.data[ids]
        n_rows = prediction.shape[0]
        d_scores = (prediction - target) / n_rows
        return (d_scores ** 2).sum(), d_scores

    def begin_training(self, gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
        """Allocate models, pre-process training data and acquire an
        optimizer.

        gold_tuples (callable): Not used by this implementation.
        pipeline (list): `(name, component)` pairs; every truthy
            `component.tok2vec` is appended to `self.input_models`.
        sgd: An optimizer; one is created with `create_optimizer()` if None.
        RETURNS: The optimizer.
        """
        if pipeline is not None:
            for _, component in pipeline:
                if getattr(component, "tok2vec", None):
                    self.input_models.append(component.tok2vec)
        if self.model is True:
            self.model = self.Model(**self.cfg)
        link_vectors_to_models(self.vocab)
        return self.create_optimizer() if sgd is None else sgd


class Tagger(Pipe):
    """Pipeline component for part-of-speech tagging.

    DOCS: https://spacy.io/api/tagger
    """

    name = "tagger"

    def __init__(self, vocab, model=True, **cfg):
        """Create the component. No weights are allocated here.

        vocab (Vocab): Stored as `self.vocab`.
        model: A model instance, or `True` to have one allocated later.
        **cfg: Config parameters, stored sorted by key.
            "cnn_maxout_pieces" defaults to 2.
        """
        self.vocab = vocab
        self.model = model
        self._rehearsal_model = None
        self.cfg = OrderedDict(sorted(cfg.items()))
        self.cfg.setdefault("cnn_maxout_pieces", 2)

    @property
    def labels(self):
        """The tag names of `self.vocab.morphology`, as a tuple."""
        return tuple(self.vocab.morphology.tag_names)

    @property
    def tok2vec(self):
        """The model's tok2vec layer chained with `flatten`, or None when no
        model has been allocated."""
        if self.model in (None, True, False):
            return None
        else:
            return chain(self.model.tok2vec, flatten)

    def __call__(self, doc):
        """Tag a single document in place and return it."""
        tags, tokvecs = self.predict([doc])
        self.set_annotations([doc], tags, tensors=tokvecs)
        return doc

    def pipe(self, stream, batch_size=128, n_threads=-1):
        """Tag a stream of documents, in batches of `batch_size`.

        n_threads is not used by this implementation.
        YIELDS: The documents of each batch, after annotation.
        """
        for docs in util.minibatch(stream, size=batch_size):
            docs = list(docs)
            tag_ids, tokvecs = self.predict(docs)
            self.set_annotations(docs, tag_ids, tensors=tokvecs)
            yield from docs

    def predict(self, docs):
        """Predict tag IDs for a batch of documents without modifying them.

        RETURNS (tuple): `(guesses, tokvecs)`. `guesses` holds one array of
            argmax indices per doc; `tokvecs` is the tok2vec output.
        """
        self.require_model()
        if not any(len(doc) for doc in docs):
            # Handle cases where there are no tokens in any docs.
            n_labels = len(self.labels)
            guesses = [self.model.ops.allocate((0, n_labels)) for doc in docs]
            tokvecs = self.model.ops.allocate((0, self.model.tok2vec.nO))
            return guesses, tokvecs
        tokvecs = self.model.tok2vec(docs)
        scores = self.model.softmax(tokvecs)
        guesses = []
        for doc_scores in scores:
            doc_guesses = doc_scores.argmax(axis=1)
            # Anything that is not a numpy array is converted with `.get()`
            # (presumably a GPU array being copied to host memory).
            if not isinstance(doc_guesses, numpy.ndarray):
                doc_guesses = doc_guesses.get()
            guesses.append(doc_guesses)
        return guesses, tokvecs

    def set_annotations(self, docs, batch_tag_ids, tensors=None):
        """Write predicted tags onto a batch of documents.

        docs: A batch of `Doc` objects (a single `Doc` is wrapped in a list).
        batch_tag_ids: One sequence of tag IDs per doc.
        tensors: Optional per-doc arrays, passed to `doc.extend_tensor`.
        """
        if isinstance(docs, Doc):
            docs = [docs]
        cdef Doc doc
        cdef Vocab vocab = self.vocab
        assign_morphology = self.cfg.get("set_morphology", True)
        for i, doc in enumerate(docs):
            doc_tag_ids = batch_tag_ids[i]
            if hasattr(doc_tag_ids, "get"):
                doc_tag_ids = doc_tag_ids.get()
            for j, tag_id in enumerate(doc_tag_ids):
                # Don't clobber preset POS tags
                if doc.c[j].tag == 0:
                    if doc.c[j].pos == 0 and assign_morphology:
                        # Don't clobber preset lemmas
                        lemma = doc.c[j].lemma
                        vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
                        if lemma != 0 and lemma != doc.c[j].lex.orth:
                            doc.c[j].lemma = lemma
                    else:
                        doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
            if tensors is not None and len(tensors):
                # If the doc already holds a numpy tensor but the new one is
                # not numpy, convert it with `.get()` before extending.
                if isinstance(doc.tensor, numpy.ndarray) \
                and not isinstance(tensors[i], numpy.ndarray):
                    doc.extend_tensor(tensors[i].get())
                else:
                    doc.extend_tensor(tensors[i])
            doc.is_tagged = True

    def update(self, docs, golds, drop=0., sgd=None, losses=None):
        """Learn from a batch of documents and gold-standard annotations.

        The loss is accumulated in `losses[self.name]` when `losses` is
        given. Nothing is updated if every doc is empty.
        """
        self.require_model()
        if losses is not None and self.name not in losses:
            losses[self.name] = 0.

        if not any(len(doc) for doc in docs):
            # Handle cases where there are no tokens in any docs.
            return

        tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop)
        loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
        bp_tag_scores(d_tag_scores, sgd=sgd)

        if losses is not None:
            losses[self.name] += loss

    def rehearse(self, docs, drop=0., sgd=None, losses=None):
        """Perform a 'rehearsal' update, where we try to match the output of
        an initial model.

        Does nothing when no rehearsal model is set or every doc is empty.
        """
        if self._rehearsal_model is None:
            return
        if not any(len(doc) for doc in docs):
            # Handle cases where there are no tokens in any docs.
            return
        guesses, backprop = self.model.begin_update(docs, drop=drop)
        target = self._rehearsal_model(docs)
        gradient = guesses - target
        backprop(gradient, sgd=sgd)
        if losses is not None:
            losses.setdefault(self.name, 0.0)
            losses[self.name] += (gradient**2).sum()

    def get_loss(self, docs, golds, scores):
        """Compute the loss and gradient for a batch.

        Gold tags that are None use the model's own guess as the target, so
        they contribute no gradient. Gold tags missing from `self.labels`
        have their gradient row masked to zero.

        RETURNS (tuple): `(loss, d_scores)`; `loss` is a float and
            `d_scores` is unflattened using the length of each doc.
        """
        scores = self.model.ops.flatten(scores)
        tag_index = {tag: i for i, tag in enumerate(self.labels)}
        cdef int idx = 0
        correct = numpy.zeros((scores.shape[0],), dtype="i")
        guesses = scores.argmax(axis=1)
        known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
        for gold in golds:
            for tag in gold.tags:
                if tag is None:
                    correct[idx] = guesses[idx]
                elif tag in tag_index:
                    correct[idx] = tag_index[tag]
                else:
                    correct[idx] = 0
                    known_labels[idx] = 0.
                idx += 1
        correct = self.model.ops.xp.array(correct, dtype="i")
        d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
        d_scores *= self.model.ops.asarray(known_labels)
        loss = (d_scores**2).sum()
        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
        return float(loss), d_scores

    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
                       **kwargs):
        """Prepare the component for training.

        Builds a tag map from the tags seen in the gold data, replaces the
        vocab's morphology if any tags were found, allocates the model if
        needed and returns an optimizer.

        get_gold_tuples (callable): Returns the gold-standard training data.
        pipeline: Not used by this implementation.
        sgd: An optimizer; one is created with `create_optimizer()` if None.
        **kwargs: "pretrained_vectors", "token_vector_width" and
            "conv_depth" are read.
        RETURNS: The optimizer.
        """
        lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
        if not any(table in self.vocab.lookups for table in lemma_tables):
            user_warning(Warnings.W022)
        orig_tag_map = dict(self.vocab.morphology.tag_map)
        new_tag_map = OrderedDict()
        for raw_text, annots_brackets in get_gold_tuples():
            # NOTE(review): this drops the last element of each
            # annots_brackets list in place, presumably a trailing entry that
            # is not an (annots, brackets) pair. Confirm against the producer
            # of the gold tuples.
            _ = annots_brackets.pop()
            for annots, brackets in annots_brackets:
                ids, words, tags, heads, deps, ents = annots
                for tag in tags:
                    if tag in orig_tag_map:
                        new_tag_map[tag] = orig_tag_map[tag]
                    else:
                        new_tag_map[tag] = {POS: X}
        cdef Vocab vocab = self.vocab
        if new_tag_map:
            vocab.morphology = Morphology(vocab.strings, new_tag_map,
                                          vocab.morphology.lemmatizer,
                                          exc=vocab.morphology.exc)
        self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors")
        if self.model is True:
            for hp in ["token_vector_width", "conv_depth"]:
                if hp in kwargs:
                    self.cfg[hp] = kwargs[hp]
            self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
        link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd

    @classmethod
    def Model(cls, n_tags, **cfg):
        """Build the tagger model with `build_tagger_model`.

        RAISES (ValueError): `TempErrors.T008` if "pretrained_dims" is set
            in `cfg` without "pretrained_vectors".
        """
        if cfg.get("pretrained_dims") and not cfg.get("pretrained_vectors"):
            raise ValueError(TempErrors.T008)
        return build_tagger_model(n_tags, **cfg)

    def add_label(self, label, values=None):
        """Add a tag to the vocab's tag map.

        label: The tag name to add.
        values (dict): Tag map entry for the label; defaults to {POS: "X"}.
        RETURNS (int): 0 if the label already exists, otherwise 1.
        RAISES (ValueError): `TempErrors.T003` if a model has already been
            allocated.
        """
        if label in self.labels:
            return 0
        if self.model not in (True, False, None):
            # Here's how the model resizing will work, once the
            # neuron-to-tag mapping is no longer controlled by
            # the Morphology class, which sorts the tag names.
            # The sorting makes adding labels difficult.
            # smaller = self.model._layers[-1]
            # larger = Softmax(len(self.labels)+1, smaller.nI)
            # copy_array(larger.W[:smaller.nO], smaller.W)
            # copy_array(larger.b[:smaller.nO], smaller.b)
            # self.model._layers[-1] = larger
            raise ValueError(TempErrors.T003)
        tag_map = dict(self.vocab.morphology.tag_map)
        if values is None:
            values = {POS: "X"}
        tag_map[label] = values
        self.vocab.morphology = Morphology(
            self.vocab.strings, tag_map=tag_map,
            lemmatizer=self.vocab.morphology.lemmatizer,
            exc=self.vocab.morphology.exc)
        return 1

    def use_params(self, params):
        """Generator that yields once while `self.model.use_params(params)`
        is active."""
        with self.model.use_params(params):
            yield

    def to_bytes(self, exclude=tuple(), **kwargs):
        """Serialize the component to a bytestring.

        exclude (list): String names of serialization fields to exclude.
        RETURNS (bytes): The serialized object.
        """
        serialize = OrderedDict()
        if self.model not in (None, True, False):
            serialize["model"] = self.model.to_bytes
        serialize["vocab"] = self.vocab.to_bytes
        serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
        tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
        serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map)
        exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
        return util.to_bytes(serialize, exclude)

    def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
        """Load the component from a bytestring.

        RETURNS: `self`.
        RAISES (ValueError): `Errors.E149` if loading the model bytes raises
            an AttributeError.
        """
        def load_model(b):
            # TODO: Remove this once we don't have to handle previous models
            if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
                self.cfg["pretrained_vectors"] = self.vocab.vectors.name
            if self.model is True:
                self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
            try:
                self.model.from_bytes(b)
            except AttributeError:
                raise ValueError(Errors.E149)

        def load_tag_map(b):
            tag_map = srsly.msgpack_loads(b)
            self.vocab.morphology = Morphology(
                self.vocab.strings, tag_map=tag_map,
                lemmatizer=self.vocab.morphology.lemmatizer,
                exc=self.vocab.morphology.exc)

        # The model is registered last because load_model() sizes the model
        # from the morphology and cfg loaded by the earlier entries.
        deserialize = OrderedDict((
            ("vocab", lambda b: self.vocab.from_bytes(b)),
            ("tag_map", load_tag_map),
            ("cfg", lambda b: self.cfg.update(srsly.json_loads(b))),
            ("model", load_model),
        ))
        exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
        util.from_bytes(bytes_data, deserialize, exclude)
        return self

    def to_disk(self, path, exclude=tuple(), **kwargs):
        """Serialize the component to disk.

        The model is only written when one has been allocated, matching
        `to_bytes`.

        path: Location passed on to `util.to_disk`.
        exclude (list): String names of serialization fields to exclude.
        """
        tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))

        def save_model(p):
            # Close the handle deterministically so the bytes are flushed
            # before this method returns.
            with p.open("wb") as file_:
                file_.write(self.model.to_bytes())

        serialize = OrderedDict()
        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
        serialize["tag_map"] = lambda p: srsly.write_msgpack(p, tag_map)
        # An unallocated model (None/True/False) has nothing to serialize.
        if self.model not in (None, True, False):
            serialize["model"] = save_model
        serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
        exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
        util.to_disk(path, serialize, exclude)

    def from_disk(self, path, exclude=tuple(), **kwargs):
        """Load the component from disk.

        RETURNS: `self`.
        RAISES (ValueError): `Errors.E149` if loading the model bytes raises
            an AttributeError.
        """
        def load_model(p):
            # TODO: Remove this once we don't have to handle previous models
            if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
                self.cfg["pretrained_vectors"] = self.vocab.vectors.name
            if self.model is True:
                self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
            with p.open("rb") as file_:
                try:
                    self.model.from_bytes(file_.read())
                except AttributeError:
                    raise ValueError(Errors.E149)

        def load_tag_map(p):
            tag_map = srsly.read_msgpack(p)
            self.vocab.morphology = Morphology(
                self.vocab.strings, tag_map=tag_map,
                lemmatizer=self.vocab.morphology.lemmatizer,
                exc=self.vocab.morphology.exc)

        deserialize = OrderedDict((
            ("cfg", lambda p: self.cfg.update(_load_cfg(p))),
            ("vocab", lambda p: self.vocab.from_disk(p)),
            ("tag_map", load_tag_map),
            ("model", load_model),
        ))
        exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
        util.from_disk(path, deserialize, exclude)
        return self


class MultitaskObjective(Tagger):
    """Experimental: Assist training of a parser or tagger, by training a
    side-objective.
    """

    name = "nn_labeller"

    def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
        """Create the component.

        vocab (Vocab): Stored as `self.vocab`.
        model: A model instance, or `True` to have one allocated later.
        target: Name of a built-in label function ("dep", "tag", "ent",
            "dep_tag_offset", "ent_tag", "sent_start"), or a callable with
            the signature `(i, words, tags, heads, deps, ents)`.
        **cfg: Config parameters. "cnn_maxout_pieces" defaults to 2.
        RAISES (ValueError): `Errors.E016` for an unrecognised target.
        """
        self.vocab = vocab
        self.model = model
        # The inherited Tagger.rehearse() reads this attribute; without it
        # a rehearse() call would fail with AttributeError.
        self._rehearsal_model = None
        if target == "dep":
            self.make_label = self.make_dep
        elif target == "tag":
            self.make_label = self.make_tag
        elif target == "ent":
            self.make_label = self.make_ent
        elif target == "dep_tag_offset":
            self.make_label = self.make_dep_tag_offset
        elif target == "ent_tag":
            self.make_label = self.make_ent_tag
        elif target == "sent_start":
            self.make_label = self.make_sent_start
        elif hasattr(target, "__call__"):
            self.make_label = target
        else:
            raise ValueError(Errors.E016)
        self.cfg = dict(cfg)
        self.cfg.setdefault("cnn_maxout_pieces", 2)

    @property
    def labels(self):
        """Mapping of label to class index, stored under `cfg["labels"]`
        (created as an empty dict on first access)."""
        return self.cfg.setdefault("labels", {})

    @labels.setter
    def labels(self, value):
        self.cfg["labels"] = value

    def set_annotations(self, docs, dep_ids, tensors=None):
        """No-op: this component does not write annotations to the docs."""
        pass

    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, tok2vec=None,
                       sgd=None, **kwargs):
        """Collect the label set from the gold data, allocate the model if
        needed and return an optimizer.

        get_gold_tuples (callable): Returns the gold-standard training data;
            it is run through `nonproj.preprocess_training_data`.
        pipeline: Not used by this implementation.
        tok2vec: The token-to-vector layer passed to `Model`.
        sgd: An optimizer; one is created with `create_optimizer()` if None.
        RETURNS: The optimizer.
        """
        gold_tuples = nonproj.preprocess_training_data(get_gold_tuples())
        for raw_text, annots_brackets in gold_tuples:
            for annots, brackets in annots_brackets:
                ids, words, tags, heads, deps, ents = annots
                for i in range(len(ids)):
                    label = self.make_label(i, words, tags, heads, deps, ents)
                    # Labels get consecutive indices in first-seen order.
                    if label is not None and label not in self.labels:
                        self.labels[label] = len(self.labels)
        if self.model is True:
            self.model = self.Model(len(self.labels), tok2vec=tok2vec)
        link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd

    @classmethod
    def Model(cls, n_tags, tok2vec=None, **cfg):
        """Build the model: `tok2vec`, a layer-normalised Maxout layer and a
        Softmax over `n_tags` classes.

        The width is read with `util.env_opt("token_vector_width", 96)`.
        The returned model exposes `tok2vec` and `softmax` attributes.
        """
        token_vector_width = util.env_opt("token_vector_width", 96)
        softmax = Softmax(n_tags, token_vector_width*2)
        model = chain(
            tok2vec,
            LayerNorm(Maxout(token_vector_width*2, token_vector_width, pieces=3)),
            softmax
        )
        model.tok2vec = tok2vec
        model.softmax = softmax
        return model

    def predict(self, docs):
        """Run tok2vec and softmax over a batch.

        RETURNS (tuple): `(tokvecs, scores)`. Note that this order differs
            from `Tagger.predict`.
        """
        self.require_model()
        tokvecs = self.model.tok2vec(docs)
        scores = self.model.softmax(tokvecs)
        return tokvecs, scores

    def get_loss(self, docs, golds, scores):
        """Compute the loss and gradient for a batch.

        Tokens whose label is None or not in `self.labels` use the model's
        own guess as the target, so they contribute no gradient.

        RETURNS (tuple): `(loss, d_scores)` with `loss` as a float.
        RAISES (ValueError): `Errors.E077` if `docs` and `golds` differ in
            length.
        """
        if len(docs) != len(golds):
            raise ValueError(Errors.E077.format(value="loss", n_docs=len(docs),
                                                n_golds=len(golds)))
        cdef int idx = 0
        correct = numpy.zeros((scores.shape[0],), dtype="i")
        guesses = scores.argmax(axis=1)
        for i, gold in enumerate(golds):
            for j in range(len(docs[i])):
                # Handles alignment for tokenization differences
                label = self.make_label(j, gold.words, gold.tags,
                                        gold.heads, gold.labels, gold.ents)
                if label is None or label not in self.labels:
                    correct[idx] = guesses[idx]
                else:
                    correct[idx] = self.labels[label]
                idx += 1
        correct = self.model.ops.xp.array(correct, dtype="i")
        d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
        loss = (d_scores**2).sum()
        return float(loss), d_scores

    @staticmethod
    def make_dep(i, words, tags, heads, deps, ents):
        """Label: the dependency label of token i, or None if its label or
        head is missing."""
        if deps[i] is None or heads[i] is None:
            return None
        return deps[i]

    @staticmethod
    def make_tag(i, words, tags, heads, deps, ents):
        """Label: the tag of token i."""
        return tags[i]

    @staticmethod
    def make_ent(i, words, tags, heads, deps, ents):
        """Label: the entity annotation of token i, or None if `ents` is
        None."""
        if ents is None:
            return None
        return ents[i]

    @staticmethod
    def make_dep_tag_offset(i, words, tags, heads, deps, ents):
        """Label: "<dep>-<tag>:<offset>", where offset is the head position
        relative to token i, clipped to the range [-2, 2]. Returns None if
        the dependency label or head is missing."""
        if deps[i] is None or heads[i] is None:
            return None
        offset = heads[i] - i
        offset = min(offset, 2)
        offset = max(offset, -2)
        return "%s-%s:%d" % (deps[i], tags[i], offset)

    @staticmethod
    def make_ent_tag(i, words, tags, heads, deps, ents):
        """Label: "<tag>-<ent>" for token i, or None if the entity
        annotation is missing."""
        if ents is None or ents[i] is None:
            return None
        else:
            return "%s-%s" % (tags[i], ents[i])

    @staticmethod
    def make_sent_start(target, words, tags, heads, deps, ents, cache=True, _cache={}):
        """A multi-task objective for representing sentence boundaries,
        using BILU scheme. (O is impossible)

        The implementation of this method uses an internal cache that relies
        on the identity of the heads array, to avoid requiring a new piece
        of gold data. You can pass cache=False if you know the cache will
        do the wrong thing.

        Tokens whose head chain ends in None, or loops without reaching a
        root, get the label None.

        target (int): Index of the token to label.
        RETURNS: "B-SENT", "I-SENT", "L-SENT", "U-SENT" or None.
        """
        assert len(words) == len(heads)
        assert target < len(words), (target, len(words))
        if cache:
            if id(heads) in _cache:
                return _cache[id(heads)][target]
            # Only the most recent heads array is kept. `_cache` is the
            # shared default-argument dict, so it must be cleared in place.
            _cache.clear()
            sent_tags = ["I-SENT"] * len(words)
            _cache[id(heads)] = sent_tags
        else:
            sent_tags = ["I-SENT"] * len(words)

        def _find_root(child):
            # Follow head links until a token that is its own head.
            visited = set()
            while child is not None and heads[child] != child:
                if child in visited:
                    # Cyclic head chain: no root is reachable. Treat it like
                    # a missing head instead of looping forever.
                    return None
                visited.add(child)
                child = heads[child]
            return child

        sentences = {}
        for i in range(len(words)):
            root = _find_root(i)
            if root is None:
                sent_tags[i] = None
            else:
                sentences.setdefault(root, []).append(i)
        for root, span in sorted(sentences.items()):
            if len(span) == 1:
                sent_tags[span[0]] = "U-SENT"
            else:
                sent_tags[span[0]] = "B-SENT"
                sent_tags[span[-1]] = "L-SENT"
        return sent_tags[target]


class ClozeMultitask(Pipe):
    """Auxiliary objective that predicts, from the tok2vec output, either the
    characters of each token or its row in the vocab's vectors table.

    The model is only trained through `rehearse`; `update` and
    `set_annotations` do nothing.
    """

    @classmethod
    def Model(cls, vocab, tok2vec, **cfg):
        """Put an output layer on top of tok2vec and wrap the result with
        masked_language_model.

        cfg must contain "objective". For "characters" the output layer is a
        MultiSoftmax with cfg["nr_char"] (default 4) blocks of 256 classes;
        for anything else it is a Maxout + Affine projection whose width is
        taken from vocab.vectors.data.shape[1].
        """
        if cfg["objective"] != "characters":
            width = vocab.vectors.data.shape[1]
            hidden = LayerNorm(Maxout(width, tok2vec.nO, pieces=3))
            projection = zero_init(Affine(width, width, drop_factor=0.0))
            output_layer = chain(hidden, projection)
        else:
            output_layer = MultiSoftmax([256] * cfg.get("nr_char", 4))
        model = masked_language_model(vocab, chain(tok2vec, output_layer))
        # Keep handles on both halves so they can be called separately.
        model.tok2vec = tok2vec
        model.output_layer = output_layer
        return model

    def __init__(self, vocab, model=True, **cfg):
        """Store the vocab/model and fill in the default objective settings."""
        cfg.setdefault("objective", "characters")
        cfg.setdefault("nr_char", 4)
        self.vocab = vocab
        self.model = model
        self.cfg = cfg

    def set_annotations(self, docs, dep_ids, tensors=None):
        """No-op: this objective writes nothing to the docs."""
        pass

    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None,
                        tok2vec=None, sgd=None, **kwargs):
        """Create the model if needed, initialise the output layer and
        return an optimizer (the given `sgd`, or a newly created one).
        """
        link_vectors_to_models(self.vocab)
        if self.model is True:
            # Settings stored on the component win over the call's kwargs.
            kwargs.update(self.cfg)
            self.model = self.Model(self.vocab, tok2vec, **kwargs)
        sample = self.model.ops.allocate((5, self.model.tok2vec.nO))
        self.model.output_layer.begin_training(sample)
        return self.create_optimizer() if sgd is None else sgd

    def predict(self, docs):
        """Return (tok2vec output, output-layer predictions) for the docs."""
        self.require_model()
        tokvecs = self.model.tok2vec(docs)
        return tokvecs, self.model.output_layer(tokvecs)

    def get_loss(self, docs, vectors, prediction):
        """Return (loss as float, gradient) for the configured objective."""
        if self.cfg["objective"] != "characters":
            # The simplest way to implement this would be to vstack the
            # token.vector values, but that's a bit inefficient, especially
            # on GPU. Instead we fetch the index into the vectors table for
            # each of our tokens, and look them up all at once. This
            # prevents data copying.
            ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
            loss, gradient = get_cossim_loss(prediction, vectors[ids], ignore_zeros=True)
        else:
            loss, gradient = get_characters_loss(self.model.ops, docs, prediction)
        return float(loss), gradient

    def update(self, docs, golds, drop=0., sgd=None, losses=None):
        """No-op: training happens in `rehearse`."""
        pass

    def rehearse(self, docs, drop=0., sgd=None, losses=None):
        """Run one training step on the docs and add the loss to
        `losses[self.name]` when a losses dict is given.
        """
        self.require_model()
        track_loss = losses is not None
        if track_loss and self.name not in losses:
            losses[self.name] = 0.
        predictions, backprop = self.model.begin_update(docs, drop=drop)
        loss, d_predictions = self.get_loss(docs, self.vocab.vectors.data, predictions)
        backprop(d_predictions, sgd=sgd)
        if track_loss:
            losses[self.name] += loss

    @staticmethod
    def decode_utf8_predictions(char_array):
        """Turn character predictions back into strings.

        The array is viewed as (rows, positions, 256) and the argmax over
        the last axis gives one byte value per position.
        """
        # The format alternates filling from start and end, and 255 is missing
        codes = char_array.reshape((char_array.shape[0], -1, 256)).argmax(axis=-1)
        words = []
        for row in codes:
            head = [chr(c) for c in row[::2] if c != 255]
            tail = [chr(c) for c in row[1::2] if c != 255]
            tail.reverse()
            words.append("".join(head + tail))
        return words


class TextCategorizer(Pipe):
    """Pipeline component for text classification.

    DOCS: https://spacy.io/api/textcategorizer
    """
    name = 'textcat'

    @classmethod
    def Model(cls, nr_class=1, **cfg):
        """Build the classifier selected by cfg["architecture"].

        "simple_cnn" and "bow" pick the matching builders; any other value
        (including none) uses build_text_classifier.
        """
        embed_size = util.env_opt("embed_size", 2000)
        if "token_vector_width" in cfg:
            token_vector_width = cfg["token_vector_width"]
        else:
            token_vector_width = util.env_opt("token_vector_width", 96)
        architecture = cfg.get("architecture")
        if architecture == "simple_cnn":
            tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg)
            return build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg)
        if architecture == "bow":
            return build_bow_text_classifier(nr_class, **cfg)
        return build_text_classifier(nr_class, **cfg)

    @property
    def tok2vec(self):
        """The model's tok2vec layer, or None while no model is set."""
        return None if self.model in (None, True, False) else self.model.tok2vec

    def __init__(self, vocab, model=True, **cfg):
        """Store the vocab, the model placeholder and a copy of the config."""
        self.vocab = vocab
        self.model = model
        self._rehearsal_model = None
        self.cfg = dict(cfg)

    @property
    def labels(self):
        """The labels stored in the config, as a tuple."""
        return tuple(self.cfg.setdefault("labels", []))

    def require_labels(self):
        """Raise an error if the component's model has no labels defined."""
        if not self.labels:
            raise ValueError(Errors.E143.format(name=self.name))

    @labels.setter
    def labels(self, value):
        self.cfg["labels"] = tuple(value)

    def pipe(self, stream, batch_size=128, n_threads=-1):
        """Score the stream batch by batch and yield the annotated docs."""
        for batch in util.minibatch(stream, size=batch_size):
            batch = list(batch)
            scores, tensors = self.predict(batch)
            self.set_annotations(batch, scores, tensors=tensors)
            for doc in batch:
                yield doc

    def predict(self, docs):
        """Return (scores, tensors): one row of label scores per doc, and
        each doc's tensor.
        """
        self.require_model()
        tensors = [doc.tensor for doc in docs]
        if any(len(doc) for doc in docs):
            scores = self.model.ops.asarray(self.model(docs))
        else:
            # Handle cases where there are no tokens in any docs.
            xp = get_array_module(tensors)
            scores = xp.zeros((len(docs), len(self.labels)))
        return scores, tensors

    def set_annotations(self, docs, scores, tensors=None):
        """Write scores[i, j] into doc.cats for doc i and label j."""
        for row, doc in enumerate(docs):
            for col, label in enumerate(self.labels):
                doc.cats[label] = float(scores[row, col])

    def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
        """Run one training step and add the loss to losses[self.name]."""
        self.require_model()
        if not any(len(doc) for doc in docs):
            # Handle cases where there are no tokens in any docs.
            return
        scores, backprop = self.model.begin_update(docs, drop=drop)
        loss, d_scores = self.get_loss(docs, golds, scores)
        backprop(d_scores, sgd=sgd)
        if losses is None:
            return
        losses.setdefault(self.name, 0.0)
        losses[self.name] += loss

    def rehearse(self, docs, drop=0., sgd=None, losses=None):
        """Pull the model's scores towards those of the rehearsal model.

        Does nothing when no rehearsal model is set.
        """
        if self._rehearsal_model is None:
            return
        if not any(len(doc) for doc in docs):
            # Handle cases where there are no tokens in any docs.
            return
        scores, backprop = self.model.begin_update(docs, drop=drop)
        gradient = scores - self._rehearsal_model(docs)
        backprop(gradient, sgd=sgd)
        if losses is None:
            return
        losses.setdefault(self.name, 0.0)
        losses[self.name] += (gradient**2).sum()

    def get_loss(self, docs, golds, scores):
        """Return (mean squared error, gradient of the scores).

        Labels missing from gold.cats are masked out of the gradient.
        """
        shape = (len(golds), len(self.labels))
        truths = numpy.zeros(shape, dtype="f")
        not_missing = numpy.ones(shape, dtype="f")
        for row, gold in enumerate(golds):
            for col, label in enumerate(self.labels):
                if label not in gold.cats:
                    not_missing[row, col] = 0.
                else:
                    truths[row, col] = gold.cats[label]
        truths = self.model.ops.asarray(truths)
        not_missing = self.model.ops.asarray(not_missing)
        d_scores = (scores-truths) / scores.shape[0]
        d_scores *= not_missing
        mean_square_error = (d_scores**2).sum(axis=1).mean()
        return float(mean_square_error), d_scores

    def add_label(self, label):
        """Add a label. Returns 0 if it already exists, otherwise 1.

        Raises ValueError (E116) once a model has been created.
        """
        if label in self.labels:
            return 0
        if self.model not in (None, True, False):
            # This functionality was available previously, but was broken.
            # The problem is that we resize the last layer, but the last layer
            # is actually just an ensemble. We're not resizing the child layers
            # - a huge problem.
            raise ValueError(Errors.E116)
        self.labels = tuple(list(self.labels) + [label])
        return 1

    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
        """Collect labels from the gold data, create the model if needed and
        return an optimizer (the given `sgd`, or a newly created one).
        """
        for raw_text, (_, (cats, _2)) in get_gold_tuples():
            for cat in cats:
                self.add_label(cat)
        if self.model is True:
            self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors")
            self.require_labels()
            self.model = self.Model(len(self.labels), **self.cfg)
            link_vectors_to_models(self.vocab)
        return self.create_optimizer() if sgd is None else sgd


cdef class DependencyParser(Parser):
    """Pipeline component for dependency parsing.

    DOCS: https://spacy.io/api/dependencyparser
    """

    name = "parser"
    TransitionSystem = ArcEager
    nr_feature = 8

    @property
    def postprocesses(self):
        """Post-processing functions: deprojectivize, plus merge_subtokens
        when cfg["learn_tokens"] is True.
        """
        steps = [nonproj.deprojectivize]
        if self.cfg.get("learn_tokens") is True:
            steps.append(merge_subtokens)
        return tuple(steps)

    def add_multitask_objective(self, target):
        """Register an auxiliary objective: a ClozeMultitask for "cloze",
        a MultitaskObjective for any other target.
        """
        if target == "cloze":
            objective = ClozeMultitask(self.vocab)
        else:
            objective = MultitaskObjective(self.vocab, target=target)
        self._multitasks.append(objective)

    def init_multitask_objectives(self, get_gold_tuples, pipeline, sgd=None, **cfg):
        """Begin training for each registered objective, sharing this
        parser's tok2vec layer.
        """
        for objective in self._multitasks:
            objective.begin_training(get_gold_tuples, pipeline=pipeline,
                                     tok2vec=self.model.tok2vec, sgd=sgd)

    def __reduce__(self):
        return (DependencyParser, (self.vocab, self.moves, self.model), None, None)

    @property
    def labels(self):
        """Sorted tuple of the labels found in the transition names."""
        found = set()
        # Get the labels from the model by looking at the available moves
        for move in self.move_names:
            if "-" not in move:
                continue
            label = move.split("-")[1]
            if "||" in label:
                label = label.split("||")[1]
            found.add(label)
        return tuple(sorted(found))


cdef class EntityRecognizer(Parser):
    """Pipeline component for named entity recognition.

    DOCS: https://spacy.io/api/entityrecognizer
    """

    name = "ner"
    TransitionSystem = BiluoPushDown
    nr_feature = 3

    def add_multitask_objective(self, target):
        """Register an auxiliary objective: a ClozeMultitask for "cloze",
        a MultitaskObjective for any other target.
        """
        if target == "cloze":
            objective = ClozeMultitask(self.vocab)
        else:
            objective = MultitaskObjective(self.vocab, target=target)
        self._multitasks.append(objective)

    def init_multitask_objectives(self, get_gold_tuples, pipeline, sgd=None, **cfg):
        """Begin training for each registered objective, sharing this
        component's tok2vec layer.
        """
        # NOTE: `sgd` is accepted but not forwarded to the objectives.
        for objective in self._multitasks:
            objective.begin_training(get_gold_tuples, pipeline=pipeline,
                                     tok2vec=self.model.tok2vec)

    def __reduce__(self):
        state = (self.vocab, self.moves, self.model)
        return (EntityRecognizer, state, None, None)

    @property
    def labels(self):
        """Sorted tuple of the entity labels found in the transition names."""
        # Get the labels from the model by looking at the available moves, e.g.
        # B-PERSON, I-PERSON, L-PERSON, U-PERSON
        prefixes = ("B", "I", "L", "U")
        found = set()
        for move in self.move_names:
            if move[0] in prefixes:
                found.add(move.split("-")[1])
        return tuple(sorted(found))


class EntityLinker(Pipe):
    """Pipeline component for named entity linking.

    DOCS: https://spacy.io/api/entitylinker
    """
    name = 'entity_linker'
    NIL = "NIL"  # string used to refer to a non-existing link

    @classmethod
    def Model(cls, **cfg):
        """Build the context encoder with build_nel_encoder.

        cfg: Model settings. "embed_width" (default 300), "hidden_width"
            (default 128) and "type_to_int" (default empty dict) are read
            here; all remaining settings are forwarded unchanged.
        RETURNS: The model returned by build_nel_encoder.
        """
        embed_width = cfg.get("embed_width", 300)
        hidden_width = cfg.get("hidden_width", 128)
        type_to_int = cfg.get("type_to_int", dict())
        # These arguments are passed explicitly below. Forwarding them a
        # second time through **cfg raises "got multiple values for keyword
        # argument" as soon as one of them is configured, so drop them from
        # the pass-through settings.
        explicit = ("embed_width", "hidden_width", "ner_types")
        extra = {key: value for key, value in cfg.items() if key not in explicit}
        model = build_nel_encoder(embed_width=embed_width, hidden_width=hidden_width,
                                  ner_types=len(type_to_int), **extra)
        return model

    def __init__(self, vocab, **cfg):
        """Create the component without a model or knowledge base.

        The knowledge base has to be supplied with `set_kb` before training
        or prediction.
        """
        self.vocab = vocab
        self.model = True
        self.kb = None
        self.cfg = dict(cfg)

    def set_kb(self, kb):
        """Attach the knowledge base used for candidates and entity vectors."""
        self.kb = kb

    def require_model(self):
        """Raise ValueError (E109) if the component's model is not initialized."""
        if getattr(self, "model", None) in (None, True, False):
            raise ValueError(Errors.E109.format(name=self.name))

    def require_kb(self):
        """Raise ValueError (E139) if the knowledge base is not initialized."""
        if getattr(self, "kb", None) in (None, True, False):
            raise ValueError(Errors.E139.format(name=self.name))

    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
        """Create the model if needed and return an optimizer.

        Requires a knowledge base; its entity vector length is stored in
        cfg["entity_width"] before the model is built.
        RETURNS: The given `sgd`, or a newly created optimizer.
        """
        self.require_kb()
        self.cfg["entity_width"] = self.kb.entity_vector_length

        if self.model is True:
            self.model = self.Model(**self.cfg)

        if sgd is None:
            sgd = self.create_optimizer()

        return sgd

    def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None):
        """Run one training step on the positive links in the gold data.

        docs: A Doc or a list of Docs.
        golds: The matching gold object(s); `gold.links` maps an entity
            offset pair to a dict of {kb_id: value}.
        losses (dict): Optional; the loss is added under self.name.
        RETURNS: The loss, or 0 when there was nothing to train on.
        RAISES: ValueError (E077) if docs and golds differ in length.
        """
        self.require_model()
        self.require_kb()

        if losses is not None:
            losses.setdefault(self.name, 0.0)

        if not docs or not golds:
            return 0

        # Wrap a single Doc before comparing lengths, so that len() counts
        # documents rather than tokens.
        if isinstance(docs, Doc):
            docs = [docs]
            golds = [golds]

        if len(docs) != len(golds):
            raise ValueError(Errors.E077.format(value="EL training", n_docs=len(docs),
                                                n_golds=len(golds)))

        # One context doc per positive link, in the same order in which
        # get_similarity_loss collects the entity vectors.
        context_docs = []
        for doc, gold in zip(docs, golds):
            for kb_dict in gold.links.values():
                for value in kb_dict.values():
                    # Currently only training on the positive instances
                    if value:
                        context_docs.append(doc)

        if not context_docs:
            # No positive instances in this batch: nothing to backprop.
            return 0

        context_encodings, bp_context = self.model.begin_update(context_docs, drop=drop)
        loss, d_scores = self.get_similarity_loss(scores=context_encodings, golds=golds, docs=None)
        bp_context(d_scores, sgd=sgd)

        if losses is not None:
            losses[self.name] += loss
        return loss

    def get_similarity_loss(self, docs, golds, scores):
        """Cosine-similarity loss between the context encodings and the KB
        vectors of the positively linked entities.

        RETURNS: (loss divided by the number of entities, gradients).
        RAISES: RuntimeError (E147) if `scores` and the collected entity
            vectors differ in shape.
        """
        entity_encodings = []
        for gold in golds:
            for entity, kb_dict in gold.links.items():
                for kb_id, value in kb_dict.items():
                    # this loss function assumes we're only using positive examples
                    if value:
                        entity_encoding = self.kb.get_vector(kb_id)
                        entity_encodings.append(entity_encoding)

        entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")

        if scores.shape != entity_encodings.shape:
            raise RuntimeError(Errors.E147.format(method="get_loss", msg="gold entities do not match up"))

        loss, gradients = get_cossim_loss(yh=scores, y=entity_encodings)
        loss = loss / len(entity_encodings)
        return loss, gradients

    def get_loss(self, docs, golds, scores):
        """Squared-error loss of `scores` against every link value in the
        gold data (positive and negative).

        RETURNS: (loss divided by the number of links, scores - values).
        RAISES: RuntimeError (E147) if the number of scores and links differ.
        """
        cats = []
        for gold in golds:
            for entity, kb_dict in gold.links.items():
                for kb_id, value in kb_dict.items():
                    cats.append([value])

        cats = self.model.ops.asarray(cats, dtype="float32")
        if len(scores) != len(cats):
            raise RuntimeError(Errors.E147.format(method="get_loss", msg="gold entities do not match up"))

        d_scores = (scores - cats)
        loss = (d_scores ** 2).sum()
        loss = loss / len(cats)
        return loss, d_scores

    def __call__(self, doc):
        """Link the entities of one doc in place and return it."""
        kb_ids, tensors = self.predict([doc])
        self.set_annotations([doc], kb_ids, tensors=tensors)
        return doc

    def pipe(self, stream, batch_size=128, n_threads=-1):
        """Link the stream batch by batch and yield the annotated docs."""
        for docs in util.minibatch(stream, size=batch_size):
            docs = list(docs)
            kb_ids, tensors = self.predict(docs)
            self.set_annotations(docs, kb_ids, tensors=tensors)
            yield from docs

    def predict(self, docs):
        """Return the KB IDs for each entity in each doc, including NIL if
        there is no prediction.

        RETURNS: (kb_ids, tensors), two lists with one item per entity: the
            chosen KB ID (or self.NIL) and the context encoding of the doc
            the entity belongs to.
        RAISES: RuntimeError (E147) / ValueError (E161) on inconsistent
            array sizes.
        """
        self.require_model()
        self.require_kb()

        entity_count = 0
        final_kb_ids = []
        final_tensors = []

        if not docs:
            return final_kb_ids, final_tensors

        if isinstance(docs, Doc):
            docs = [docs]

        context_encodings = self.model(docs)
        xp = get_array_module(context_encodings)

        for i, doc in enumerate(docs):
            if len(doc) > 0:
                # currently, the context is the same for each entity in a sentence (should be refined)
                context_encoding = context_encodings[i]
                context_enc_t = context_encoding.T
                norm_1 = xp.linalg.norm(context_enc_t)
                for ent in doc.ents:
                    entity_count += 1

                    candidates = self.kb.get_candidates(ent.text)
                    if not candidates:
                        final_kb_ids.append(self.NIL)  # no prediction possible for this entity
                        final_tensors.append(context_encoding)
                    else:
                        random.shuffle(candidates)

                        # this will set all prior probabilities to 0 if they should be excluded from the model
                        prior_probs = xp.asarray([c.prior_prob for c in candidates])
                        if not self.cfg.get("incl_prior", True):
                            prior_probs = xp.asarray([0.0 for c in candidates])
                        scores = prior_probs

                        # add in similarity from the context
                        if self.cfg.get("incl_context", True):
                            entity_encodings = xp.asarray([c.entity_vector for c in candidates])
                            norm_2 = xp.linalg.norm(entity_encodings, axis=1)

                            if len(entity_encodings) != len(prior_probs):
                                raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length"))

                            # cosine similarity
                            sims = xp.dot(entity_encodings, context_enc_t) / (norm_1 * norm_2)
                            if sims.shape != prior_probs.shape:
                                raise ValueError(Errors.E161)
                            # combine prior and similarity: p + s - p*s
                            scores = prior_probs + sims - (prior_probs*sims)

                        # TODO: thresholding
                        best_index = scores.argmax()
                        best_candidate = candidates[best_index]
                        final_kb_ids.append(best_candidate.entity_)
                        final_tensors.append(context_encoding)

        if not (len(final_tensors) == len(final_kb_ids) == entity_count):
            raise RuntimeError(Errors.E147.format(method="predict", msg="result variables not of equal length"))

        return final_kb_ids, final_tensors

    def set_annotations(self, docs, kb_ids, tensors=None):
        """Write the predicted KB IDs onto the tokens of each entity.

        kb_ids holds one ID per entity, in document order.
        RAISES: ValueError (E148) if the number of IDs and entities differ.
        """
        count_ents = len([ent for doc in docs for ent in doc.ents])
        if count_ents != len(kb_ids):
            raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))

        i = 0
        for doc in docs:
            for ent in doc.ents:
                kb_id = kb_ids[i]
                i += 1
                for token in ent:
                    token.ent_kb_id_ = kb_id

    def to_disk(self, path, exclude=tuple(), **kwargs):
        """Save the config, vocab, knowledge base and (if present) the model
        below `path`.
        """
        def save_model(p):
            # Close the file deterministically instead of relying on
            # garbage collection.
            with p.open("wb") as file_:
                file_.write(self.model.to_bytes())

        serialize = OrderedDict()
        serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
        serialize["kb"] = lambda p: self.kb.dump(p)
        if self.model not in (None, True, False):
            serialize["model"] = save_model
        exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
        util.to_disk(path, serialize, exclude)

    def from_disk(self, path, exclude=tuple(), **kwargs):
        """Load the config, vocab, knowledge base and model from `path`.

        RETURNS (EntityLinker): The component itself.
        RAISES: ValueError (E149) if the model rejects the stored bytes with
            an AttributeError.
        """
        def load_model(p):
            if self.model is True:
                self.model = self.Model(**self.cfg)
            with p.open("rb") as file_:
                model_bytes = file_.read()
            try:
                self.model.from_bytes(model_bytes)
            except AttributeError:
                raise ValueError(Errors.E149)

        def load_kb(p):
            kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=self.cfg["entity_width"])
            kb.load_bulk(p)
            self.set_kb(kb)

        # cfg is listed first because load_kb and load_model read self.cfg.
        deserialize = OrderedDict()
        deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
        deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
        deserialize["kb"] = load_kb
        deserialize["model"] = load_model
        exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
        util.from_disk(path, deserialize, exclude)
        return self

    def rehearse(self, docs, sgd=None, losses=None, **config):
        """Not supported by this component."""
        raise NotImplementedError

    def add_label(self, label):
        """Not supported by this component."""
        raise NotImplementedError


class Sentencizer(object):
    """Segment the Doc into sentences using a rule-based strategy.

    DOCS: https://spacy.io/api/sentencizer
    """

    name = "sentencizer"
    default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
            '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄',
            '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿',
            '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶',
            '꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒',
            '﹖', '﹗', '！', '．', '？', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀',
            '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼',
            '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐',
            '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂',
            '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈']

    def __init__(self, punct_chars=None, **kwargs):
        """Initialize the sentencizer.

        punct_chars (list): Punctuation characters to split on. Will be
            serialized with the nlp object. Falls back to
            `default_punct_chars` when empty or not given.
        RETURNS (Sentencizer): The sentencizer component.

        DOCS: https://spacy.io/api/sentencizer#init
        """
        chosen = punct_chars if punct_chars else self.default_punct_chars
        self.punct_chars = set(chosen)

    def __call__(self, doc):
        """Apply the sentencizer to a Doc and set Token.is_sent_start.

        A new sentence starts at the first token after a split character
        that is neither punctuation nor itself a split character.

        doc (Doc): The document to process.
        RETURNS (Doc): The processed Doc.

        DOCS: https://spacy.io/api/sentencizer#call
        """
        sent_begin = 0
        boundary_pending = False
        for index, token in enumerate(doc):
            splits_here = token.text in self.punct_chars
            token.is_sent_start = (index == 0)
            if splits_here:
                boundary_pending = True
            elif boundary_pending and not token.is_punct:
                # Confirm the start of the sentence that just ended, then
                # open the next one at this token.
                doc[sent_begin].is_sent_start = True
                sent_begin = token.i
                boundary_pending = False
        if sent_begin < len(doc):
            doc[sent_begin].is_sent_start = True
        return doc

    def to_bytes(self, **kwargs):
        """Serialize the sentencizer to a bytestring.

        RETURNS (bytes): The serialized object.

        DOCS: https://spacy.io/api/sentencizer#to_bytes
        """
        payload = {"punct_chars": list(self.punct_chars)}
        return srsly.msgpack_dumps(payload)

    def from_bytes(self, bytes_data, **kwargs):
        """Load the sentencizer from a bytestring.

        bytes_data (bytes): The data to load.
        returns (Sentencizer): The loaded object.

        DOCS: https://spacy.io/api/sentencizer#from_bytes
        """
        loaded = srsly.msgpack_loads(bytes_data)
        chars = loaded.get("punct_chars", self.default_punct_chars)
        self.punct_chars = set(chars)
        return self

    def to_disk(self, path, exclude=tuple(), **kwargs):
        """Serialize the sentencizer to disk, as JSON at `path` with its
        suffix replaced by ".json".

        DOCS: https://spacy.io/api/sentencizer#to_disk
        """
        json_path = util.ensure_path(path).with_suffix(".json")
        srsly.write_json(json_path, {"punct_chars": list(self.punct_chars)})

    def from_disk(self, path, exclude=tuple(), **kwargs):
        """Load the sentencizer from disk, reading JSON from `path` with its
        suffix replaced by ".json".

        DOCS: https://spacy.io/api/sentencizer#from_disk
        """
        json_path = util.ensure_path(path).with_suffix(".json")
        loaded = srsly.read_json(json_path)
        chars = loaded.get("punct_chars", self.default_punct_chars)
        self.punct_chars = set(chars)
        return self


# Public names exported by this module.
__all__ = [
    "Tagger",
    "DependencyParser",
    "EntityRecognizer",
    "Tensorizer",
    "TextCategorizer",
    "EntityLinker",
    "Sentencizer",
]
-												Update draft of parser neural network model

Model is good, but code is messy. Currently requires Chainer, which may cause the build to fail on machines without a GPU.

Outline of the model:

We first predict context-sensitive vectors for each word in the input:

(embed_lower | embed_prefix | embed_suffix | embed_shape)
>> Maxout(token_width)
>> convolution ** 4

This convolutional layer is shared between the tagger and the parser. This prevents the parser from needing tag features.
To boost the representation, we make a "super tag" with POS, morphology and dependency label. The tagger predicts this
by adding a softmax layer onto the convolutional layer --- so, we're teaching the convolutional layer to give us a
representation that's one affine transform from this informative lexical information. This is obviously good for the
parser (which backprops to the convolutions too).

The parser model makes a state vector by concatenating the vector representations for its context tokens. Current
results suggest few context tokens works well. Maybe this is a bug.

The current context tokens:

* S0, S1, S2: Top three words on the stack
* B0, B1: First two words of the buffer
* S0L1, S0L2: Leftmost and second leftmost children of S0
* S0R1, S0R2: Rightmost and second rightmost children of S0
* S1L1, S1L2, S1R2, S1R, B0L1, B0L2: Likewise for S1 and B0

This makes the state vector quite long: 13*T, where T is the token vector width (128 is working well). Fortunately,
there's a way to structure the computation to save some expense (and make it more GPU friendly).

The parser typically visits 2*N states for a sentence of length N (although it may visit more, if it back-tracks
with a non-monotonic transition). A naive implementation would require 2*N (B, 13*T) @ (13*T, H) matrix multiplications
for a batch of size B. We can instead perform one (B*N, T) @ (T, 13*H) multiplication, to pre-compute the hidden
weights for each positional feature wrt the words in the batch. (Note that our token vectors come from the CNN
-- so we can't play this trick over the vocabulary. That's how Stanford's NN parser works --- and why its model
is so big.)

This pre-computation strategy allows a nice compromise between GPU-friendliness and implementation simplicity.
The CNN and the wide lower layer are computed on the GPU, and then the precomputed hidden weights are moved
to the CPU, before we start the transition-based parsing process. This makes a lot of things much easier.
We don't have to worry about variable-length batch sizes, and we don't have to implement the dynamic oracle
in CUDA to train.

Currently the parser's loss function is multilabel log loss, as the dynamic oracle allows multiple states to
be 0 cost. This is defined as:

(exp(score) / Z) - (exp(score) / gZ)

Where gZ is the sum of the scores assigned to gold classes. I'm very interested in regressing on the cost directly,
but so far this isn't working well.

Machinery is in place for beam-search, which has been working well for the linear model. Beam search should benefit
greatly from the pre-computation trick.

											
										
										
											2017-05-13 00:09:15 +03:00
+								# cython: infer_types=True
 								# cython: profile=True
-												Clean up imports, unused code, whitespace, docstrings

											
										
										
											2017-04-15 13:05:47 +03:00
+								# coding: utf8
 								from __future__ import unicode_literals
-												💫 Break up large pipeline.pyx (#3246)

* Break up large pipeline.pyx

* Merge some components back together

* Fix typo

											
										
										
											2019-02-10 14:14:51 +03:00
+								import numpy
-												💫 Replace ujson, msgpack and dill/pickle/cloudpickle with srsly (#3003)

Remove hacks and wrappers, keep code in sync across our libraries and move spaCy a few steps closer to only depending on packages with binary wheels 🎉

See here: https://github.com/explosion/srsly

    Serialization is hard, especially across Python versions and multiple platforms. After dealing with many subtle bugs over the years (encodings, locales, large files) our libraries like spaCy and Prodigy have steadily grown a number of utility functions to wrap the multiple serialization formats we need to support (especially json, msgpack and pickle). These wrapping functions ended up duplicated across our codebases, so we wanted to put them in one place.

    At the same time, we noticed that having a lot of small dependencies was making maintenance harder, and making installation slower. To solve this, we've made srsly standalone, by including the component packages directly within it. This way we can provide all the serialization utilities we need in a single binary wheel.

    srsly currently includes forks of the following packages:

        ujson
        msgpack
        msgpack-numpy
        cloudpickle



* WIP: replace json/ujson with srsly

* Replace ujson in examples

Use regular json instead of srsly to make code easier to read and follow

* Update requirements

* Fix imports

* Fix typos

* Replace msgpack with srsly

* Fix warning

											
										
										
											2018-12-03 03:28:22 +03:00
+								import srsly
-												context encoder with Tok2Vec + linking model instead of cosine

											
										
										
											2019-06-28 09:29:31 +03:00
+								import random
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								from collections import OrderedDict
-												Tidy up pipeline

											
										
										
											2017-10-27 21:29:08 +03:00
+								from thinc.api import chain
-												Improve parser multi-task objective

											
										
										
											2018-12-01 16:41:24 +03:00
+								from thinc.v2v import Affine, Maxout, Softmax
 								from thinc.misc import LayerNorm
-												first tests with EL pipe

											
										
										
											2019-06-10 22:25:26 +03:00
+								from thinc.neural.util import to_categorical
-												improve speed of prediction loop

											
										
										
											2019-06-26 14:53:10 +03:00
+								from thinc.neural.util import get_array_module
-												Add built-in factories for merge_entities and merge_noun_chunks

Allows adding those components to the pipeline out-of-the-box if they're defined in a model's meta.json. Also allows usage as nlp.add_pipe(nlp.create_pipe('merge_entities')).

											
										
										
											2018-03-15 02:18:51 +03:00
-												Add merge_subtokens as parser post-process. Re #3830

											
										
										
											2019-06-07 21:40:41 +03:00
+								from .functions import merge_subtokens
-												💫 Break up large pipeline.pyx (#3246)

* Break up large pipeline.pyx

* Merge some components back together

* Fix typo

											
										
										
											2019-02-10 14:14:51 +03:00
+								from ..tokens.doc cimport Doc
 								from ..syntax.nn_parser cimport Parser
 								from ..syntax.ner cimport BiluoPushDown
 								from ..syntax.arc_eager cimport ArcEager
 								from ..morphology cimport Morphology
 								from ..vocab cimport Vocab
-												💫 Rule-based NER component (#2513)

* Add helper function for reading in JSONL

* Add rule-based NER component

* Fix whitespace

* Add component to factories

* Add tests

* Add option to disable indent on json_dumps compat

Otherwise, reading JSONL back in line by line won't work

* Fix error code

											
										
										
											2018-07-18 20:43:16 +03:00
-												💫 Break up large pipeline.pyx (#3246)

* Break up large pipeline.pyx

* Merge some components back together

* Fix typo

											
										
										
											2019-02-10 14:14:51 +03:00
+								from ..syntax import nonproj
 								from ..attrs import POS, ID
 								from ..parts_of_speech import X
-												Fix absolute imports and avoid importing from cli

											
										
										
											2019-08-20 16:08:59 +03:00
+								from ..kb import KnowledgeBase
 								from .._ml import Tok2Vec, build_tagger_model, cosine, get_cossim_loss
-												Default to former TextCategorizer model

* Keep TextCategorizer default model same as v2.0
* Add option 'architecture' that allows "simple_cnn" to switch to
simpler model.
* Add option exclusive_classes, defaulting to False. If set to True,
the model treats classes as mutually exclusive, i.e. only one class can
be true per instance.

											
										
										
											2019-02-23 13:55:16 +03:00
+								from .._ml import build_text_classifier, build_simple_cnn_text_classifier
-												implementing el pipe in pipes.pyx (not tested yet)

											
										
										
											2019-06-03 22:32:54 +03:00
+								from .._ml import build_bow_text_classifier, build_nel_encoder
-												💫 Break up large pipeline.pyx (#3246)

* Break up large pipeline.pyx

* Merge some components back together

* Fix typo

											
										
										
											2019-02-10 14:14:51 +03:00
+								from .._ml import link_vectors_to_models, zero_init, flatten
-												Use cosine loss in Cloze multitask

											
										
										
											2019-10-06 20:23:46 +03:00
+								from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss
-												Use chars loss in ClozeMultitask

											
										
										
											2019-10-20 18:47:15 +03:00
+								from .._ml import MultiSoftmax, get_characters_loss
-												Warn in Tagger.begin_training if no lemma tables are available (#4351)


											
										
										
											2019-10-01 16:13:55 +03:00
+								from ..errors import Errors, TempErrors, user_warning, Warnings
-												💫 Break up large pipeline.pyx (#3246)

* Break up large pipeline.pyx

* Merge some components back together

* Fix typo

											
										
										
											2019-02-10 14:14:51 +03:00
+								from .. import util
-												💫 Rule-based NER component (#2513)

* Add helper function for reading in JSONL

* Add rule-based NER component

* Fix whitespace

* Add component to factories

* Add tests

* Add option to disable indent on json_dumps compat

Otherwise, reading JSONL back in line by line won't work

* Fix error code

											
										
										
											2018-07-18 20:43:16 +03:00
-												💫 Break up large pipeline.pyx (#3246)

* Break up large pipeline.pyx

* Merge some components back together

* Fix typo

											
										
										
											2019-02-10 14:14:51 +03:00
+								def _load_cfg(path):
 								    if path.exists():
 								        return srsly.read_json(path)
 								    else:
 								        return {}
-												💫 Rule-based NER component (#2513)

* Add helper function for reading in JSONL

* Add rule-based NER component

* Fix whitespace

* Add component to factories

* Add tests

* Add option to disable indent on json_dumps compat

Otherwise, reading JSONL back in line by line won't work

* Fix error code

											
										
										
											2018-07-18 20:43:16 +03:00
-												Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop"

This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing
changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df.

											
										
										
											2018-03-27 20:23:02 +03:00
-												Rename BaseThincComponent --> Pipe

											
										
										
											2017-10-26 13:40:40 +03:00
+								class Pipe(object):
-												Tidy up pipeline

											
										
										
											2017-10-27 21:29:08 +03:00
+								    """This class is not instantiated directly. Components inherit from it, and
 								    it defines the interface that components should follow to function as
 								    components in a spaCy analysis pipeline.
 								    """
-												💫 Break up large pipeline.pyx (#3246)

* Break up large pipeline.pyx

* Merge some components back together

* Fix typo

											
										
										
											2019-02-10 14:14:51 +03:00
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								    name = None
 								    @classmethod
 								    def Model(cls, *shape, **kwargs):
-												Fix formatting

											
										
										
											2017-09-25 19:37:13 +03:00
+								        """Initialize a model for the pipe."""
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								        raise NotImplementedError
 								    def __init__(self, vocab, model=True, **cfg):
-												Fix formatting

											
										
										
											2017-09-25 19:37:13 +03:00
+								        """Create a new pipe instance."""
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								        raise NotImplementedError
 								    def __call__(self, doc):
-												Fix formatting

											
										
										
											2017-09-25 19:37:13 +03:00
+								        """Apply the pipe to one document. The document is
-												Add docstrings for Pipe API

											
										
										
											2017-09-25 17:20:49 +03:00
+								        modified in-place, and returned.
-												Fix formatting

											
										
										
											2017-09-25 19:37:13 +03:00
-												Add docstrings for Pipe API

											
										
										
											2017-09-25 17:20:49 +03:00
+								        Both __call__ and pipe should delegate to the `predict()`
 								        and `set_annotations()` methods.
-												Fix formatting

											
										
										
											2017-09-25 19:37:13 +03:00
+								        """
-												💫 Raise better error when using uninitialized pipeline component (#3074)

After creating a component, the `.model` attribute is left with the value `True`, to indicate it should be created later during `from_disk()`, `from_bytes()` or `begin_training()`. This had led to confusing errors if you try to use the component without initializing the model.

To fix this, we add a method `require_model()` to the `Pipe` base class. The `require_model()` method needs to be called at the start of the `.predict()` and `.update()` methods of the components. It raises a `ValueError` if the model is not initialized. An error message has been added to `spacy.errors`.
											
										
										
											2018-12-20 17:54:53 +03:00
+								        self.require_model()
-												Fix Pipe base class

											
										
										
											2019-08-01 18:29:01 +03:00
+								        predictions = self.predict([doc])
-												Fix iss4278 (#4279)

* fix: len(tuple) == 2

* (#4278) add fail test

* add contributor's agreement

											
										
										
											2019-09-12 11:44:49 +03:00
+								        if isinstance(predictions, tuple) and len(predictions) == 2:
-												Fix Pipe base class

											
										
										
											2019-08-01 18:29:01 +03:00
+								            scores, tensors = predictions
-												remove redundant __call__ method in pipes.TextCategorizer (#4305)

* remove redundant __call__ method in pipes.TextCategorizer

Because the parent __call__ method behaves in the same way.

* fix: Pipe.__call__ arg

* fix: invalid arg in Pipe.__call__

* modified:   spacy/tests/regression/test_issue4278.py (#4278)

* deleted:    Pipfile

											
										
										
											2019-09-18 22:31:27 +03:00
+								            self.set_annotations([doc], scores, tensors=tensors)
-												Fix Pipe base class

											
										
										
											2019-08-01 18:29:01 +03:00
+								        else:
 								            self.set_annotations([doc], predictions)
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								        return doc
-												💫 Raise better error when using uninitialized pipeline component (#3074)

After creating a component, the `.model` attribute is left with the value `True`, to indicate it should be created later during `from_disk()`, `from_bytes()` or `begin_training()`. This had led to confusing errors if you try to use the component without initializing the model.

To fix this, we add a method `require_model()` to the `Pipe` base class. The `require_model()` method needs to be called at the start of the `.predict()` and `.update()` methods of the components. It raises a `ValueError` if the model is not initialized. An error message has been added to `spacy.errors`.
											
										
										
											2018-12-20 17:54:53 +03:00
+								    def require_model(self):
 								        """Raise an error if the component's model is not initialized."""
-												💫 Break up large pipeline.pyx (#3246)

* Break up large pipeline.pyx

* Merge some components back together

* Fix typo

											
										
										
											2019-02-10 14:14:51 +03:00
+								        if getattr(self, "model", None) in (None, True, False):
-												💫 Raise better error when using uninitialized pipeline component (#3074)

After creating a component, the `.model` attribute is left with the value `True`, to indicate it should be created later during `from_disk()`, `from_bytes()` or `begin_training()`. This had led to confusing errors if you try to use the component without initializing the model.

To fix this, we add a method `require_model()` to the `Pipe` base class. The `require_model()` method needs to be called at the start of the `.predict()` and `.update()` methods of the components. It raises a `ValueError` if the model is not initialized. An error message has been added to `spacy.errors`.
											
										
										
											2018-12-20 17:54:53 +03:00
+								            raise ValueError(Errors.E109.format(name=self.name))
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								    def pipe(self, stream, batch_size=128, n_threads=-1):
-												Fix formatting

											
										
										
											2017-09-25 19:37:13 +03:00
+								        """Apply the pipe to a stream of documents.
-												Add docstrings for Pipe API

											
										
										
											2017-09-25 17:20:49 +03:00
 								        Both __call__ and pipe should delegate to the `predict()`
 								        and `set_annotations()` methods.
-												Fix formatting

											
										
										
											2017-09-25 19:37:13 +03:00
+								        """
-												Remove cytoolz usage from spaCy

											
										
										
											2018-12-03 04:19:12 +03:00
+								        for docs in util.minibatch(stream, size=batch_size):
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								            docs = list(docs)
-												Make pipe base class a bit less presumptuous

											
										
										
											2019-07-28 18:56:11 +03:00
+								            predictions = self.predict(docs)
 								            if isinstance(predictions, tuple) and len(predictions) == 2:
 								                scores, tensors = predictions
-												remove redundant __call__ method in pipes.TextCategorizer (#4305)

* remove redundant __call__ method in pipes.TextCategorizer

Because the parent __call__ method behaves in the same way.

* fix: Pipe.__call__ arg

* fix: invalid arg in Pipe.__call__

* modified:   spacy/tests/regression/test_issue4278.py (#4278)

* deleted:    Pipfile

											
										
										
											2019-09-18 22:31:27 +03:00
+								                self.set_annotations(docs, scores, tensors=tensors)
-												Make pipe base class a bit less presumptuous

											
										
										
											2019-07-28 18:56:11 +03:00
+								            else:
 								                self.set_annotations(docs, predictions)
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								            yield from docs
 								    def predict(self, docs):
-												Fix formatting

											
										
										
											2017-09-25 19:37:13 +03:00
+								        """Apply the pipeline's model to a batch of docs, without
-												Add docstrings for Pipe API

											
										
										
											2017-09-25 17:20:49 +03:00
+								        modifying them.
-												Fix formatting

											
										
										
											2017-09-25 19:37:13 +03:00
+								        """
-												💫 Raise better error when using uninitialized pipeline component (#3074)

After creating a component, the `.model` attribute is left with the value `True`, to indicate it should be created later during `from_disk()`, `from_bytes()` or `begin_training()`. This had led to confusing errors if you try to use the component without initializing the model.

To fix this, we add a method `require_model()` to the `Pipe` base class. The `require_model()` method needs to be called at the start of the `.predict()` and `.update()` methods of the components. It raises a `ValueError` if the model is not initialized. An error message has been added to `spacy.errors`.
											
										
										
											2018-12-20 17:54:53 +03:00
+								        self.require_model()
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								        raise NotImplementedError
-												Set Doc.tensor from Tagger

											
										
										
											2017-11-03 13:20:05 +03:00
+								    def set_annotations(self, docs, scores, tensors=None):
-												Fix formatting

											
										
										
											2017-09-25 19:37:13 +03:00
+								        """Modify a batch of documents, using pre-computed scores."""
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								        raise NotImplementedError
-												💫 Break up large pipeline.pyx (#3246)

* Break up large pipeline.pyx

* Merge some components back together

* Fix typo

											
										
										
											2019-02-10 14:14:51 +03:00
+								    def update(self, docs, golds, drop=0.0, sgd=None, losses=None):
-												Fix formatting

											
										
										
											2017-09-25 19:37:13 +03:00
+								        """Learn from a batch of documents and gold-standard information,
-												Add docstrings for Pipe API

											
										
										
											2017-09-25 17:20:49 +03:00
+								        updating the pipe's model.
 								        Delegates to predict() and get_loss().
-												Fix formatting

											
										
										
											2017-09-25 19:37:13 +03:00
+								        """
-												Don't raise NotImplemented in Pipe.update

											
										
										
											2019-07-28 18:54:11 +03:00
+								        pass
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-text ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also give us a good place to implement the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categorizer
											
										
										
											2018-12-10 18:25:33 +03:00
+								    def rehearse(self, docs, sgd=None, losses=None, **config):
 								        pass
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								    def get_loss(self, docs, golds, scores):
-												Fix formatting

											
										
										
											2017-09-25 19:37:13 +03:00
+								        """Find the loss and gradient of loss for the batch of
 								        documents and their predicted scores."""
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								        raise NotImplementedError
-												Add add_label methods to Tagger and TextCategorizer

											
										
										
											2017-11-01 18:32:44 +03:00
+								    def add_label(self, label):
 								        """Add an output label, to be predicted by the model.
-												Use consistent spelling

											
										
										
											2019-10-02 11:37:39 +03:00
+								        It's possible to extend pretrained models with new labels,
-												Add add_label methods to Tagger and TextCategorizer

											
										
										
											2017-11-01 18:32:44 +03:00
+								        but care should be taken to avoid the "catastrophic forgetting"
 								        problem.
 								        """
 								        raise NotImplementedError
-												Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop"

This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing
changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df.

											
										
										
											2018-03-27 20:23:02 +03:00
-												Return optimizer from begin_training, creating if necessary

											
										
										
											2017-11-06 16:26:26 +03:00
+								    def create_optimizer(self):
-												💫 Break up large pipeline.pyx (#3246)

* Break up large pipeline.pyx

* Merge some components back together

* Fix typo

											
										
										
											2019-02-10 14:14:51 +03:00
+								        return create_default_optimizer(self.model.ops, **self.cfg.get("optimizer", {}))
-												Add add_label methods to Tagger and TextCategorizer

											
										
										
											2017-11-01 18:32:44 +03:00
-												💫 Break up large pipeline.pyx (#3246)

* Break up large pipeline.pyx

* Merge some components back together

* Fix typo

											
										
										
											2019-02-10 14:14:51 +03:00
+								    def begin_training(
 								        self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs
 								    ):
-												Fix formatting

											
										
										
											2017-09-25 19:37:13 +03:00
+								        """Initialize the pipe for training, using data examples if available.
 								        If no model has been initialized yet, the model is added."""
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								        if self.model is True:
-												Add docstrings for Pipe API

											
										
										
											2017-09-25 17:20:49 +03:00
+								            self.model = self.Model(**self.cfg)
-												Make pipe base class a bit less presumptuous

											
										
										
											2019-07-28 18:56:11 +03:00
+								        if hasattr(self, "vocab"):
 								            link_vectors_to_models(self.vocab)
-												Return optimizer from begin_training, creating if necessary

											
										
										
											2017-11-06 16:26:26 +03:00
+								        if sgd is None:
 								            sgd = self.create_optimizer()
 								        return sgd
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
 								    def use_params(self, params):
-												Tidy up pipeline

											
										
										
											2017-10-27 21:29:08 +03:00
+								        """Modify the pipe's model, to use the given parameter values."""
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								        with self.model.use_params(params):
 								            yield
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								    def to_bytes(self, exclude=tuple(), **kwargs):
 								        """Serialize the pipe to a bytestring.
 								        exclude (list): String names of serialization fields to exclude.
 								        RETURNS (bytes): The serialized object.
 								        """
-												Patch serialization bug raised in #1105

											
										
										
											2017-10-10 04:58:12 +03:00
+								        serialize = OrderedDict()
-												💫 Break up large pipeline.pyx (#3246)

* Break up large pipeline.pyx

* Merge some components back together

* Fix typo

											
										
										
											2019-02-10 14:14:51 +03:00
+								        serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
-												Fix Pipe.to_bytes() when model uninitialized

Closes #3289

											
										
										
											2019-02-21 11:42:02 +03:00
+								        if self.model not in (True, False, None):
-												💫 Break up large pipeline.pyx (#3246)

* Break up large pipeline.pyx

* Merge some components back together

* Fix typo

											
										
										
											2019-02-10 14:14:51 +03:00
+								            serialize["model"] = self.model.to_bytes
-												Make pipe base class a bit less presumptuous

											
										
										
											2019-07-28 18:56:11 +03:00
+								        if hasattr(self, "vocab"):
 								            serialize["vocab"] = self.vocab.to_bytes
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								        return util.to_bytes(serialize, exclude)
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								    def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
-												Fix formatting

											
										
										
											2017-09-25 19:37:13 +03:00
+								        """Load the pipe from a bytestring."""
-												💫 Break up large pipeline.pyx (#3246)

* Break up large pipeline.pyx

* Merge some components back together

* Fix typo

											
										
										
											2019-02-10 14:14:51 +03:00
-												Fix textcat serialization

											
										
										
											2017-09-02 16:17:20 +03:00
+								        def load_model(b):
-												Fix loading of multiple pre-trained vectors

This patch addresses #1660, which was caused by keying all pre-trained
vectors with the same ID when telling Thinc how to refer to them. This
meant that if multiple models were loaded that had pre-trained vectors,
errors or incorrect behaviour resulted.

The vectors class now includes a .name attribute, which defaults to:
{nlp.meta['lang']_nlp.meta['name']}.vectors
The vectors name is set in the cfg of the pipeline components under the
key pretrained_vectors. This replaces the previous cfg key
pretrained_dims.

In order to make existing models compatible with this change, we check
for the pretrained_dims key when loading models in from_disk and
from_bytes, and add the cfg key pretrained_vectors if we find it.

											
										
										
											2018-03-28 17:02:59 +03:00
+								            # TODO: Remove this once we don't have to handle previous models
-												💫 Break up large pipeline.pyx (#3246)

* Break up large pipeline.pyx

* Merge some components back together

* Fix typo

											
										
										
											2019-02-10 14:14:51 +03:00
+								            if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
 								                self.cfg["pretrained_vectors"] = self.vocab.vectors.name
-												Fix textcat serialization

											
										
										
											2017-09-02 16:17:20 +03:00
+								            if self.model is True:
 								                self.model = self.Model(**self.cfg)
-												💫 Improve error message when model.from_bytes() dies (#4014)

* Improve error message when model.from_bytes() dies

When Thinc's model.from_bytes() is called with a mismatched model, often
we get a particularly ungraceful error,

e.g. "AttributeError: FunctionLayer has no attribute G"

This is because we're trying to load the parameters for something like
a LayerNorm layer, and the model architecture has some other layer there
instead. This is obviously terrible, especially since the error *type*
is wrong.

I've changed it to raise a ValueError. The error message is still
probably a bit terse, but it's hard to be sure exactly what's gone
wrong.

* Update spacy/pipeline/pipes.pyx

* Update spacy/pipeline/pipes.pyx

* Update spacy/pipeline/pipes.pyx

* Update spacy/syntax/nn_parser.pyx

* Update spacy/syntax/nn_parser.pyx

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: Matthew Honnibal <honnibal+gh@gmail.com>

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: Matthew Honnibal <honnibal+gh@gmail.com>


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-07-24 12:27:34 +03:00
+								            try:
 								                self.model.from_bytes(b)
 								            except AttributeError:
 								                raise ValueError(Errors.E149)
-												Fix textcat serialization

											
										
										
											2017-09-02 16:17:20 +03:00
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        deserialize = OrderedDict()
 								        deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
-												Make pipe base class a bit less presumptuous

											
										
										
											2019-07-28 18:56:11 +03:00
+								        if hasattr(self, "vocab"):
 								            deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        deserialize["model"] = load_model
 								        exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								        util.from_bytes(bytes_data, deserialize, exclude)
 								        return self
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
def to_disk(self, path, exclude=tuple(), **kwargs):
    """Serialize the pipe to disk.

    path: Target location, handed unchanged to `util.to_disk`.
    exclude: Names of serialization fields to skip; resolved through
        `util.get_serialization_exclude` together with `kwargs`.
    **kwargs: Extra keyword arguments, forwarded to
        `util.get_serialization_exclude`.
    """
    def write_model(p):
        # Close the handle deterministically rather than leaving the flush
        # and release to garbage collection.
        with p.open("wb") as file_:
            file_.write(self.model.to_bytes())

    serialize = OrderedDict()
    serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
    # Not every component carries a vocab; same guard as in from_bytes.
    if hasattr(self, "vocab"):
        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
    # `model` may still be a placeholder (None/True/False) when no model
    # has been allocated yet; in that case there is nothing to write.
    if self.model not in (None, True, False):
        serialize["model"] = write_model
    exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
    util.to_disk(path, serialize, exclude)
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
def from_disk(self, path, exclude=tuple(), **kwargs):
    """Load the pipe from disk.

    path: Source location, handed unchanged to `util.from_disk`.
    exclude: Names of serialization fields to skip; resolved through
        `util.get_serialization_exclude` together with `kwargs`.
    **kwargs: Extra keyword arguments, forwarded to
        `util.get_serialization_exclude`.
    RETURNS: The pipe itself (`self`).
    RAISES (ValueError): `Errors.E149` if `self.model.from_bytes` fails
        with an `AttributeError`.
    """
    def load_model(p):
        # TODO: Remove this once we don't have to handle previous models
        if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
            self.cfg["pretrained_vectors"] = self.vocab.vectors.name
        # `True` is the placeholder for "allocate the model later".
        if self.model is True:
            self.model = self.Model(**self.cfg)
        # Read inside a context manager so the handle is always closed.
        with p.open("rb") as file_:
            model_bytes = file_.read()
        # Keep the try body minimal: only a failure inside from_bytes is
        # translated into the E149 error.
        try:
            self.model.from_bytes(model_bytes)
        except AttributeError:
            raise ValueError(Errors.E149)

    # cfg is registered before model because load_model reads self.cfg
    # (presumably util.from_disk visits the entries in insertion order).
    deserialize = OrderedDict()
    deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
    # Not every component carries a vocab; same guard as in from_bytes.
    if hasattr(self, "vocab"):
        deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
    deserialize["model"] = load_model
    exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
    util.from_disk(path, deserialize, exclude)
    return self
-												Rename BaseThincComponent --> Pipe

											
										
										
											2017-10-26 13:40:40 +03:00
+								class Tensorizer(Pipe):
-												Fix dropout in tensorizer, update comment

											
										
										
											2018-11-03 15:46:58 +03:00
+								    """Pre-train position-sensitive vectors for tokens."""
-												💫 Break up large pipeline.pyx (#3246)

* Break up large pipeline.pyx

* Merge some components back together

* Fix typo

											
										
										
											2019-02-10 14:14:51 +03:00
 								    name = "tensorizer"
-												Update draft of parser neural network model

Model is good, but code is messy. Currently requires Chainer, which may cause the build to fail on machines without a GPU.

Outline of the model:

We first predict context-sensitive vectors for each word in the input:

(embed_lower | embed_prefix | embed_suffix | embed_shape)
>> Maxout(token_width)
>> convolution ** 4

This convolutional layer is shared between the tagger and the parser. This prevents the parser from needing tag features.
To boost the representation, we make a "super tag" with POS, morphology and dependency label. The tagger predicts this
by adding a softmax layer onto the convolutional layer --- so, we're teaching the convolutional layer to give us a
representation that's one affine transform from this informative lexical information. This is obviously good for the
parser (which backprops to the convolutions too).

The parser model makes a state vector by concatenating the vector representations for its context tokens. Current
results suggest that using only a few context tokens works well. Maybe this is a bug.

The current context tokens:

* S0, S1, S2: Top three words on the stack
* B0, B1: First two words of the buffer
* S0L1, S0L2: Leftmost and second leftmost children of S0
* S0R1, S0R2: Rightmost and second rightmost children of S0
* S1L1, S1L2, S1R1, S1R2, B0L1, B0L2: Likewise for S1 and B0

This makes the state vector quite long: 13*T, where T is the token vector width (128 is working well). Fortunately,
there's a way to structure the computation to save some expense (and make it more GPU friendly).

The parser typically visits 2*N states for a sentence of length N (although it may visit more, if it back-tracks
with a non-monotonic transition). A naive implementation would require 2*N (B, 13*T) @ (13*T, H) matrix multiplications
for a batch of size B. We can instead perform one (B*N, T) @ (T, 13*H) multiplication, to pre-compute the hidden
weights for each positional feature wrt the words in the batch. (Note that our token vectors come from the CNN
-- so we can't play this trick over the vocabulary. That's how Stanford's NN parser works --- and why its model
is so big.)

This pre-computation strategy allows a nice compromise between GPU-friendliness and implementation simplicity.
The CNN and the wide lower layer are computed on the GPU, and then the precomputed hidden weights are moved
to the CPU, before we start the transition-based parsing process. This makes a lot of things much easier.
We don't have to worry about variable-length batch sizes, and we don't have to implement the dynamic oracle
in CUDA to train.

Currently the parser's loss function is multilabel log loss, as the dynamic oracle allows multiple states to
be 0 cost. This is defined as:

(exp(score) / Z) - (exp(score) / gZ)

Where gZ is the sum of the scores assigned to gold classes. I'm very interested in regressing on the cost directly,
but so far this isn't working well.

Machinery is in place for beam-search, which has been working well for the linear model. Beam search should benefit
greatly from the pre-computation trick.

											
										
										
											2017-05-13 00:09:15 +03:00
-												Improve integration of NN parser, to support unified training API

											
										
										
											2017-05-15 22:46:08 +03:00
@classmethod
def Model(cls, output_size=300, **cfg):
    """Create a new statistical model for the class.

    output_size (int): Output width of the affine layer.
    **cfg: Config parameters. Only `input_size` is read here (fallback 96);
        it serves as the default passed to
        `util.env_opt("token_vector_width", ...)`.
    RETURNS (Model): The `Affine` layer after it has been passed through
        `zero_init`.
    """
    fallback_width = cfg.get("input_size", 96)
    input_size = util.env_opt("token_vector_width", fallback_width)
    affine = Affine(output_size, input_size, drop_factor=0.0)
    return zero_init(affine)
-												Update draft of parser neural network model

Model is good, but code is messy. Currently requires Chainer, which may cause the build to fail on machines without a GPU.

Outline of the model:

We first predict context-sensitive vectors for each word in the input:

(embed_lower | embed_prefix | embed_suffix | embed_shape)
>> Maxout(token_width)
>> convolution ** 4

This convolutional layer is shared between the tagger and the parser. This prevents the parser from needing tag features.
To boost the representation, we make a "super tag" with POS, morphology and dependency label. The tagger predicts this
by adding a softmax layer onto the convolutional layer --- so, we're teaching the convolutional layer to give us a
representation that's one affine transform from this informative lexical information. This is obviously good for the
parser (which backprops to the convolutions too).

The parser model makes a state vector by concatenating the vector representations for its context tokens. Current
results suggest that using only a few context tokens works well. Maybe this is a bug.

The current context tokens:

* S0, S1, S2: Top three words on the stack
* B0, B1: First two words of the buffer
* S0L1, S0L2: Leftmost and second leftmost children of S0
* S0R1, S0R2: Rightmost and second rightmost children of S0
* S1L1, S1L2, S1R1, S1R2, B0L1, B0L2: Likewise for S1 and B0

This makes the state vector quite long: 13*T, where T is the token vector width (128 is working well). Fortunately,
there's a way to structure the computation to save some expense (and make it more GPU friendly).

The parser typically visits 2*N states for a sentence of length N (although it may visit more, if it back-tracks
with a non-monotonic transition). A naive implementation would require 2*N (B, 13*T) @ (13*T, H) matrix multiplications
for a batch of size B. We can instead perform one (B*N, T) @ (T, 13*H) multiplication, to pre-compute the hidden
weights for each positional feature wrt the words in the batch. (Note that our token vectors come from the CNN
-- so we can't play this trick over the vocabulary. That's how Stanford's NN parser works --- and why its model
is so big.)

This pre-computation strategy allows a nice compromise between GPU-friendliness and implementation simplicity.
The CNN and the wide lower layer are computed on the GPU, and then the precomputed hidden weights are moved
to the CPU, before we start the transition-based parsing process. This makes a lot of things much easier.
We don't have to worry about variable-length batch sizes, and we don't have to implement the dynamic oracle
in CUDA to train.

Currently the parser's loss function is multilabel log loss, as the dynamic oracle allows multiple states to
be 0 cost. This is defined as:

(exp(score) / Z) - (exp(score) / gZ)

Where gZ is the sum of the scores assigned to gold classes. I'm very interested in regressing on the cost directly,
but so far this isn't working well.

Machinery is in place for beam-search, which has been working well for the linear model. Beam search should benefit
greatly from the pre-computation trick.

											
										
										
											2017-05-13 00:09:15 +03:00
-												Improve integration of NN parser, to support unified training API

											
										
										
											2017-05-15 22:46:08 +03:00
def __init__(self, vocab, model=True, **cfg):
    """Construct a new tensorizer. Weights are not allocated on
    initialisation.

    vocab (Vocab): A `Vocab` instance. The model must share the same
        `Vocab` instance with the `Doc` objects it will process.
    model (Model): A `Model` instance or `True` to allocate one later.
    **cfg: Config parameters, copied into `self.cfg`. The key
        `cnn_maxout_pieces` is set to 3 unless it was supplied.

    EXAMPLE:
        >>> from spacy.pipeline import Tensorizer
        >>> tensorizer = Tensorizer(nlp.vocab)
        >>> tensorizer.model = tensorizer.Model(output_size=300)
    """
    # Work on a private copy so the caller's keyword dict is never shared.
    settings = dict(cfg)
    settings.setdefault("cnn_maxout_pieces", 3)
    self.vocab = vocab
    self.model = model
    # Starts empty; begin_training appends upstream `tok2vec` models here.
    self.input_models = []
    self.cfg = settings
-												Bug fix to tagger: wasnt backproping to token vectors

											
										
										
											2017-05-17 14:13:14 +03:00
-												Fix __call__ method

											
										
										
											2017-05-28 16:11:58 +03:00
def __call__(self, doc):
    """Apply the component to a single `Doc`.

    The doc is run through `predict` as a one-element batch and the result
    is written back with `set_annotations`.

    doc (Doc): The document to process.
    RETURNS (Doc): The same `doc` object that was passed in.
    """
    predicted = self.predict([doc])
    self.set_annotations([doc], predicted)
    return doc
-												Redesign training to integrate NN components

* Obsolete .parser, .entity etc names in favour of .pipeline
* Components no longer create models on initialization
* Models created by loading method (from_disk(), from_bytes() etc), or
    .begin_training()
* Add .predict(), .set_annotations() methods in components
* Pass state through pipeline, to allow components to share information
    more flexibly.

											
										
										
											2017-05-16 17:17:30 +03:00
-												Fix use_params and pipe methods

											
										
										
											2017-05-18 16:30:59 +03:00
def pipe(self, stream, batch_size=128, n_threads=-1):
    """Process `Doc` objects as a stream.

    stream (iterator): A sequence of `Doc` objects to process.
    batch_size (int): Number of `Doc` objects to group.
    n_threads (int): Accepted but not used by this method.
    YIELDS (Doc): The processed documents, batch by batch.
    """
    for chunk in util.minibatch(stream, size=batch_size):
        # Materialise the batch: it is traversed by predict,
        # set_annotations and the final yield.
        batch = list(chunk)
        self.set_annotations(batch, self.predict(batch))
        yield from batch
-												Bug fixes to pipeline

											
										
										
											2017-05-18 12:29:51 +03:00
-												Redesign training to integrate NN components

* Obsolete .parser, .entity etc names in favour of .pipeline
* Components no longer create models on initialization
* Models created by loading method (from_disk(), from_bytes() etc), or
    .begin_training()
* Add .predict(), .set_annotations() methods in components
* Pass state through pipeline, to allow components to share information
    more flexibly.

											
										
										
											2017-05-16 17:17:30 +03:00
def predict(self, docs):
    """Run the model over the `tensor` attributes of a batch of documents.

    docs (iterable): A sequence of `Doc` objects. It is traversed twice
        (once for the tensors, once for the lengths).
    RETURNS (object): The model output, split per document by
        `self.model.ops.unflatten` using each doc's length.
    """
    self.require_model()
    ops = self.model.ops
    flat_inputs = ops.flatten([doc.tensor for doc in docs])
    flat_outputs = self.model(flat_inputs)
    lengths = [len(doc) for doc in docs]
    return ops.unflatten(flat_outputs, lengths)
-												Redesign training to integrate NN components

* Obsolete .parser, .entity etc names in favour of .pipeline
* Components no longer create models on initialization
* Models created by loading method (from_disk(), from_bytes() etc), or
    .begin_training()
* Add .predict(), .set_annotations() methods in components
* Pass state through pipeline, to allow components to share information
    more flexibly.

											
										
										
											2017-05-16 17:17:30 +03:00
-												Update tensorizer component

											
										
										
											2017-11-03 22:20:26 +03:00
def set_annotations(self, docs, tensors):
    """Set the tensor attribute for a batch of documents.

    docs (iterable): A sequence of `Doc` objects.
    tensors (object): One tensor per doc, paired with `docs` by position.
    RAISES (ValueError): `Errors.E076` if a tensor's first dimension does
        not equal the length of its doc.
    """
    for doc, tensor in zip(docs, tensors):
        n_rows = tensor.shape[0]
        n_words = len(doc)
        if n_rows != n_words:
            raise ValueError(Errors.E076.format(rows=n_rows, words=n_words))
        doc.tensor = tensor
-												Get spaCy train command working with neural network

* Integrate models into pipeline
* Add basic serialization (maybe incorrect)
* Fix pickle on vocab

											
										
										
											2017-05-17 13:04:50 +03:00
-												💫 Break up large pipeline.pyx (#3246)

* Break up large pipeline.pyx

* Merge some components back together

* Fix typo

											
										
										
											2019-02-10 14:14:51 +03:00
def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None):
    """Update the model.

    docs (iterable): A batch of `Doc` objects. A single `Doc` is wrapped
        in a list.
    golds (iterable): A batch of `GoldParse` objects, passed on to
        `get_loss`.
    state: Accepted but not used by this method.
    drop (float): The dropout rate.
    sgd (callable): An optimizer, handed to every backprop callback.
    losses (dict): Optional. If given, the loss is added to the entry
        stored under `self.name`.
    RETURNS: The loss value returned by `get_loss`.
    """
    self.require_model()
    if isinstance(docs, Doc):
        docs = [docs]
    # Forward pass through every upstream model, keeping each output
    # together with its backprop callback.
    forward = [
        tok2vec.begin_update(docs, drop=drop) for tok2vec in self.input_models
    ]
    tensors = [tensor for tensor, _ in forward]
    callbacks = [callback for _, callback in forward]
    xp = self.model.ops.xp
    stacked = xp.hstack(tensors)
    scores, bp_scores = self.model.begin_update(stacked, drop=drop)
    loss, d_scores = self.get_loss(docs, golds, scores)
    d_stacked = bp_scores(d_scores, sgd=sgd)
    # Undo the hstack: one slice along axis 1 per upstream model.
    d_slices = xp.split(d_stacked, len(self.input_models), axis=1)
    for d_slice, callback in zip(d_slices, callbacks):
        callback(d_slice, sgd=sgd)
    if losses is not None:
        losses.setdefault(self.name, 0.0)
        losses[self.name] += loss
    return loss
 								    def get_loss(self, docs, golds, prediction):
 								        """Compute the loss and gradient of `prediction` against the
 								        vocab's vectors.

 								        docs (iterable): A batch of `Doc` objects; their `ID` arrays are
 								            used to index `self.vocab.vectors.data`.
 								        golds (iterable): Accepted but not used by this method.
 								        prediction: The model output to compare against the target rows.
 								        RETURNS (tuple): `(loss, d_scores)`, where `d_scores` is the
 								            difference to the target divided by `prediction.shape[0]` and
 								            `loss` is the sum of its squares.
 								        """
 								        id_arrays = [doc.to_array(ID).ravel() for doc in docs]
 								        row_ids = self.model.ops.flatten(id_arrays)
 								        target = self.vocab.vectors.data[row_ids]
 								        n_rows = prediction.shape[0]
 								        d_scores = (prediction - target) / n_rows
 								        squared = d_scores ** 2
 								        return squared.sum(), d_scores
-												Data running through, likely errors in model

											
										
										
											2017-05-06 15:22:20 +03:00
-												💫 Break up large pipeline.pyx (#3246)

* Break up large pipeline.pyx

* Merge some components back together

* Fix typo

											
										
										
											2019-02-10 14:14:51 +03:00
+								    def begin_training(self, gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
-												Return optimizer from begin_training, creating if necessary

											
										
										
											2017-11-06 16:26:26 +03:00
+								        """Allocate models, pre-process training data and acquire an
-												Document TokenVectorEncoder

											
										
										
											2017-05-19 01:00:02 +03:00
+								        optimizer.
 								        gold_tuples (iterable): Gold-standard training data.
 								        pipeline (list): The pipeline the model is part of.
 								        """
-												Fix tensorizer

											
										
										
											2018-11-03 01:51:37 +03:00
+								        if pipeline is not None:
 								            for name, model in pipeline:
-												💫 Break up large pipeline.pyx (#3246)

* Break up large pipeline.pyx

* Merge some components back together

* Fix typo

											
										
										
											2019-02-10 14:14:51 +03:00
+								                if getattr(model, "tok2vec", None):
-												Fix tensorizer

											
										
										
											2018-11-03 01:51:37 +03:00
+								                    self.input_models.append(model.tok2vec)
-												Bug fixes to pipeline

											
										
										
											2017-05-18 12:29:51 +03:00
+								        if self.model is True:
-												Pass values for CNN maxout pieces option

											
										
										
											2017-09-21 03:15:49 +03:00
+								            self.model = self.Model(**self.cfg)
-												Allow multi-task objectives during training

											
										
										
											2017-09-26 13:42:52 +03:00
+								        link_vectors_to_models(self.vocab)
-												Return optimizer from begin_training, creating if necessary

											
										
										
											2017-11-06 16:26:26 +03:00
+								        if sgd is None:
 								            sgd = self.create_optimizer()
 								        return sgd
-												Bug fixes to pipeline

											
										
										
											2017-05-18 12:29:51 +03:00
-												Work on serialization for models

											
										
										
											2017-05-29 02:37:57 +03:00
-												Rename BaseThincComponent --> Pipe

											
										
										
											2017-10-26 13:40:40 +03:00
+								class Tagger(Pipe):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								    """Pipeline component for part-of-speech tagging.
 								    DOCS: https://spacy.io/api/tagger
 								    """
 								    name = "tagger"
-												Tidy up pipeline

											
										
										
											2017-10-27 21:29:08 +03:00
-												Add cfg attr to pipeline components

											
										
										
											2017-07-23 01:52:47 +03:00
+								    def __init__(self, vocab, model=True, **cfg):
-												Redesign training to integrate NN components

* Obsolete .parser, .entity etc names in favour of .pipeline
* Components no longer create models on initialization
* Models created by loading method (from_disk(), from_bytes() etc), or
    .begin_training()
* Add .predict(), .set_annotations() methods in components
* Pass state through pipeline, to allow components to share information
    more flexibly.

											
										
										
											2017-05-16 17:17:30 +03:00
+								        self.vocab = vocab
-												Get spaCy train command working with neural network

* Integrate models into pipeline
* Add basic serialization (maybe incorrect)
* Fix pickle on vocab

											
										
										
											2017-05-17 13:04:50 +03:00
+								        self.model = model
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-text ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also give us a good place to implement the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categorizer
											
										
										
											2018-12-10 18:25:33 +03:00
+								        self._rehearsal_model = None
-												Try to fix python3.5 serialization

											
										
										
											2017-11-08 14:10:49 +03:00
+								        self.cfg = OrderedDict(sorted(cfg.items()))
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        self.cfg.setdefault("cnn_maxout_pieces", 2)
-												Redesign training to integrate NN components

* Obsolete .parser, .entity etc names in favour of .pipeline
* Components no longer create models on initialization
* Models created by loading method (from_disk(), from_bytes() etc), or
    .begin_training()
* Add .predict(), .set_annotations() methods in components
* Pass state through pipeline, to allow components to share information
    more flexibly.

											
										
										
											2017-05-16 17:17:30 +03:00
-												Add add_label methods to Tagger and TextCategorizer

											
										
										
											2017-11-01 18:32:44 +03:00
+								    @property
 								    def labels(self):
-												💫  Make handling of [Pipe].labels consistent  (#3273)

* Make handling of [Pipe].labels consistent

* Un-xfail passing test

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: ines <ines@ines.io>

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: ines <ines@ines.io>

* Update spacy/tests/pipeline/test_pipe_methods.py

Co-Authored-By: ines <ines@ines.io>

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: ines <ines@ines.io>

* Move error message to spacy.errors

* Fix textcat labels and test

* Make EntityRuler.labels return tuple as well

											
										
										
											2019-02-14 22:03:19 +03:00
+								        return tuple(self.vocab.morphology.tag_names)
-												Add add_label methods to Tagger and TextCategorizer

											
										
										
											2017-11-01 18:32:44 +03:00
-												Update tensorizer component

											
										
										
											2017-11-03 22:20:26 +03:00
+								    @property
 								    def tok2vec(self):
 								        # Expose this component's token-to-vector layer. While
 								        # self.model is still one of the placeholder values
 								        # (None, True or False) there is no layer to expose, so
 								        # None is returned instead.
 								        if self.model in (None, True, False):
 								            return None
 								        else:
 								            # Compose the model's own tok2vec layer with `flatten`
 								            # using `chain`, and hand back the combined layer.
 								            return chain(self.model.tok2vec, flatten)
-												Remove state argument in pipeline. Other changes

											
										
										
											2017-05-19 21:26:36 +03:00
+								    def __call__(self, doc):
-												Set Doc.tensor from Tagger

											
										
										
											2017-11-03 13:20:05 +03:00
+								        tags, tokvecs = self.predict([doc])
 								        self.set_annotations([doc], tags, tensors=tokvecs)
-												Fix __call__ method

											
										
										
											2017-05-28 16:11:58 +03:00
+								        return doc
-												Redesign training to integrate NN components

* Obsolete .parser, .entity etc names in favour of .pipeline
* Components no longer create models on initialization
* Models created by loading method (from_disk(), from_bytes() etc), or
    .begin_training()
* Add .predict(), .set_annotations() methods in components
* Pass state through pipeline, to allow components to share information
    more flexibly.

											
										
										
											2017-05-16 17:17:30 +03:00
 								    def pipe(self, stream, batch_size=128, n_threads=-1):
-												Remove cytoolz usage from spaCy

											
										
										
											2018-12-03 04:19:12 +03:00
+								        for docs in util.minibatch(stream, size=batch_size):
-												Restore changes to pipeline.pyx from nn-beam-parser branch

											
										
										
											2017-08-18 23:02:35 +03:00
+								            docs = list(docs)
-												Set Doc.tensor from Tagger

											
										
										
											2017-11-03 13:20:05 +03:00
+								            tag_ids, tokvecs = self.predict(docs)
 								            self.set_annotations(docs, tag_ids, tensors=tokvecs)
-												Remove state argument in pipeline. Other changes

											
										
										
											2017-05-19 21:26:36 +03:00
+								            yield from docs
-												Redesign training to integrate NN components

* Obsolete .parser, .entity etc names in favour of .pipeline
* Components no longer create models on initialization
* Models created by loading method (from_disk(), from_bytes() etc), or
    .begin_training()
* Add .predict(), .set_annotations() methods in components
* Pass state through pipeline, to allow components to share information
    more flexibly.

											
										
										
											2017-05-16 17:17:30 +03:00
-												Don't share CNN, to reduce complexities

											
										
										
											2017-09-21 15:59:48 +03:00
+								    def predict(self, docs):
-												💫 Raise better error when using uninitialized pipeline component (#3074)

After creating a component, the `.model` attribute is left with the value `True`, to indicate it should be created later during `from_disk()`, `from_bytes()` or `begin_training()`. This had led to confusing errors if you try to use the component without initializing the model.

To fix this, we add a method `require_model()` to the `Pipe` base class. The `require_model()` method needs to be called at the start of the `.predict()` and `.update()` methods of the components. It raises a `ValueError` if the model is not initialized. An error message has been added to `spacy.errors`.
											
										
										
											2018-12-20 17:54:53 +03:00
+								        self.require_model()
-												Fix bug when docs are empty

											
										
										
											2018-06-29 14:44:25 +03:00
+								        if not any(len(doc) for doc in docs):
-												context encoder with Tok2Vec + linking model instead of cosine

											
										
										
											2019-06-28 09:29:31 +03:00
+								            # Handle cases where there are no tokens in any docs.
-												Fix tagger when docs are empty

											
										
										
											2018-06-29 16:13:45 +03:00
+								            n_labels = len(self.labels)
-												Fix tagger when doc is empty

											
										
										
											2018-06-29 17:05:40 +03:00
+								            guesses = [self.model.ops.allocate((0, n_labels)) for doc in docs]
 								            tokvecs = self.model.ops.allocate((0, self.model.tok2vec.nO))
 								            return guesses, tokvecs
-												Set Doc.tensor from Tagger

											
										
										
											2017-11-03 13:20:05 +03:00
+								        tokvecs = self.model.tok2vec(docs)
 								        scores = self.model.softmax(tokvecs)
-												Fix tensor extending in tagger

											
										
										
											2017-11-03 15:29:36 +03:00
+								        guesses = []
 								        for doc_scores in scores:
 								            doc_guesses = doc_scores.argmax(axis=1)
 								            if not isinstance(doc_guesses, numpy.ndarray):
 								                doc_guesses = doc_guesses.get()
 								            guesses.append(doc_guesses)
-												Set Doc.tensor from Tagger

											
										
										
											2017-11-03 13:20:05 +03:00
+								        return guesses, tokvecs
-												Redesign training to integrate NN components

* Obsolete .parser, .entity etc names in favour of .pipeline
* Components no longer create models on initialization
* Models created by loading method (from_disk(), from_bytes() etc), or
    .begin_training()
* Add .predict(), .set_annotations() methods in components
* Pass state through pipeline, to allow components to share information
    more flexibly.

											
										
										
											2017-05-16 17:17:30 +03:00
-												Set Doc.tensor from Tagger

											
										
										
											2017-11-03 13:20:05 +03:00
+								    def set_annotations(self, docs, batch_tag_ids, tensors=None):
-												Redesign training to integrate NN components

* Obsolete .parser, .entity etc names in favour of .pipeline
* Components no longer create models on initialization
* Models created by loading method (from_disk(), from_bytes() etc), or
    .begin_training()
* Add .predict(), .set_annotations() methods in components
* Pass state through pipeline, to allow components to share information
    more flexibly.

											
										
										
											2017-05-16 17:17:30 +03:00
+								        if isinstance(docs, Doc):
 								            docs = [docs]
 								        cdef Doc doc
 								        cdef int idx = 0
-												Bug fixes to pipeline

											
										
										
											2017-05-18 12:29:51 +03:00
+								        cdef Vocab vocab = self.vocab
-												Neaten set_morphology option on Tagger

											
										
										
											2019-03-08 21:16:02 +03:00
+								        assign_morphology = self.cfg.get("set_morphology", True)
-												Predict tags with encoder

											
										
										
											2017-05-08 15:53:45 +03:00
+								        for i, doc in enumerate(docs):
-												Fix tokvecs flattening in pipeline

											
										
										
											2017-05-21 17:05:34 +03:00
+								            doc_tag_ids = batch_tag_ids[i]
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            if hasattr(doc_tag_ids, "get"):
-												Restore changes to pipeline.pyx from nn-beam-parser branch

											
										
										
											2017-08-18 23:02:35 +03:00
+								                doc_tag_ids = doc_tag_ids.get()
-												Bug fixes to pipeline

											
										
										
											2017-05-18 12:29:51 +03:00
+								            for j, tag_id in enumerate(doc_tag_ids):
-												Avoid clobbering preset POS tags

											
										
										
											2017-06-04 23:52:42 +03:00
+								                # Don't clobber preset POS tags
-												Add set_morphology cfg option for Tagger

											
										
										
											2019-03-08 21:03:17 +03:00
+								                if doc.c[j].tag == 0:
-												Neaten set_morphology option on Tagger

											
										
										
											2019-03-08 21:16:02 +03:00
+								                    if doc.c[j].pos == 0 and assign_morphology:
-												Add set_morphology cfg option for Tagger

											
										
										
											2019-03-08 21:03:17 +03:00
+								                        # Don't clobber preset lemmas
 								                        lemma = doc.c[j].lemma
 								                        vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
 								                        if lemma != 0 and lemma != doc.c[j].lex.orth:
 								                            doc.c[j].lemma = lemma
-												Refactor morphologizer

											
										
										
											2019-03-10 01:54:59 +03:00
+								                    else:
 								                        doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
-												Predict tags with encoder

											
										
										
											2017-05-08 15:53:45 +03:00
+								                idx += 1
-												Make pipeline work on empty docs

											
										
										
											2018-06-29 20:21:38 +03:00
+								            if tensors is not None and len(tensors):
-												Fix tensorizer on GPU

											
										
										
											2017-11-05 17:34:40 +03:00
+								                if isinstance(doc.tensor, numpy.ndarray) \
 								                and not isinstance(tensors[i], numpy.ndarray):
 								                    doc.extend_tensor(tensors[i].get())
 								                else:
 								                    doc.extend_tensor(tensors[i])
-												Fix indentation error and set Doc.is_tagged correctly

											
										
										
											2018-04-10 17:14:52 +03:00
+								            doc.is_tagged = True
-												Predict tags with encoder

											
										
										
											2017-05-08 15:53:45 +03:00
-												Don't share CNN, to reduce complexities

											
										
										
											2017-09-21 15:59:48 +03:00
+								    def update(self, docs, golds, drop=0., sgd=None, losses=None):
-												💫 Raise better error when using uninitialized pipeline component (#3074)

After creating a component, the `.model` attribute is left with the value `True`, to indicate it should be created later during `from_disk()`, `from_bytes()` or `begin_training()`. This had led to confusing errors if you try to use the component without initializing the model.

To fix this, we add a method `require_model()` to the `Pipe` base class. The `require_model()` method needs to be called at the start of the `.predict()` and `.update()` methods of the components. It raises a `ValueError` if the model is not initialized. An error message has been added to `spacy.errors`.
											
										
										
											2018-12-20 17:54:53 +03:00
+								        self.require_model()
-												Track loss in tagger

											
										
										
											2017-08-20 15:42:23 +03:00
+								        if losses is not None and self.name not in losses:
 								            losses[self.name] = 0.
-												Redesign training to integrate NN components

* Obsolete .parser, .entity etc names in favour of .pipeline
* Components no longer create models on initialization
* Models created by loading method (from_disk(), from_bytes() etc), or
    .begin_training()
* Add .predict(), .set_annotations() methods in components
* Pass state through pipeline, to allow components to share information
    more flexibly.

											
										
										
											2017-05-16 17:17:30 +03:00
-												Ensure training doesn't crash with empty batches (#4360)

* unit test for previously resolved unflatten issue

* prevent a batch of empty docs from causing problems

											
										
										
											2019-10-02 13:50:48 +03:00
+								        if not any(len(doc) for doc in docs):
 								            # Handle cases where there are no tokens in any docs.
 								            return
-												Don't share CNN, to reduce complexities

											
										
										
											2017-09-21 15:59:48 +03:00
+								        tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop)
-												Redesign training to integrate NN components

* Obsolete .parser, .entity etc names in favour of .pipeline
* Components no longer create models on initialization
* Models created by loading method (from_disk(), from_bytes() etc), or
    .begin_training()
* Add .predict(), .set_annotations() methods in components
* Pass state through pipeline, to allow components to share information
    more flexibly.

											
										
										
											2017-05-16 17:17:30 +03:00
+								        loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
-												Fix tagger training

											
										
										
											2017-09-23 03:58:06 +03:00
+								        bp_tag_scores(d_tag_scores, sgd=sgd)
-												Bug fixes to pipeline

											
										
										
											2017-05-18 12:29:51 +03:00
-												Track loss in tagger

											
										
										
											2017-08-20 15:42:23 +03:00
+								        if losses is not None:
 								            losses[self.name] += loss
-												Redesign training to integrate NN components

* Obsolete .parser, .entity etc names in favour of .pipeline
* Components no longer create models on initialization
* Models created by loading method (from_disk(), from_bytes() etc), or
    .begin_training()
* Add .predict(), .set_annotations() methods in components
* Pass state through pipeline, to allow components to share information
    more flexibly.

											
										
										
											2017-05-16 17:17:30 +03:00
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-text ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also give us a good place to implement the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categorizer
											
										
										
											2018-12-10 18:25:33 +03:00
+								    def rehearse(self, docs, drop=0., sgd=None, losses=None):
 								        """Perform a 'rehearsal' update, where we try to match the output of
 								        an initial model.
 								        """
 								        if self._rehearsal_model is None:
 								            return
-												Ensure training doesn't crash with empty batches (#4360)

* unit test for previously resolved unflatten issue

* prevent a batch of empty docs from causing problems

											
										
										
											2019-10-02 13:50:48 +03:00
+								        if not any(len(doc) for doc in docs):
 								            # Handle cases where there are no tokens in any docs.
 								            return
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-text ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also give us a good place to implement the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categorizer
											
										
										
											2018-12-10 18:25:33 +03:00
+								        guesses, backprop = self.model.begin_update(docs, drop=drop)
 								        target = self._rehearsal_model(docs)
 								        gradient = guesses - target
 								        backprop(gradient, sgd=sgd)
 								        if losses is not None:
 								            losses.setdefault(self.name, 0.0)
 								            losses[self.name] += (gradient**2).sum()
-												Redesign training to integrate NN components

* Obsolete .parser, .entity etc names in favour of .pipeline
* Components no longer create models on initialization
* Models created by loading method (from_disk(), from_bytes() etc), or
    .begin_training()
* Add .predict(), .set_annotations() methods in components
* Pass state through pipeline, to allow components to share information
    more flexibly.

											
										
										
											2017-05-16 17:17:30 +03:00
+								    def get_loss(self, docs, golds, scores):
-												Pass tokvecs through as a list, instead of concatenated. Also fix padding

											
										
										
											2017-05-20 21:23:05 +03:00
+								        scores = self.model.ops.flatten(scores)
-												Fix tagger when some tags aren't in Morphology

											
										
										
											2017-11-01 21:27:49 +03:00
+								        tag_index = {tag: i for i, tag in enumerate(self.labels)}
-												Bug fixes to pipeline

											
										
										
											2017-05-18 12:29:51 +03:00
+								        cdef int idx = 0
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        correct = numpy.zeros((scores.shape[0],), dtype="i")
-												Remove state argument in pipeline. Other changes

											
										
										
											2017-05-19 21:26:36 +03:00
+								        guesses = scores.argmax(axis=1)
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
-												Update draft of parser neural network model

Model is good, but code is messy. Currently requires Chainer, which may cause the build to fail on machines without a GPU.

Outline of the model:

We first predict context-sensitive vectors for each word in the input:

(embed_lower | embed_prefix | embed_suffix | embed_shape)
>> Maxout(token_width)
>> convolution ** 4

This convolutional layer is shared between the tagger and the parser. This prevents the parser from needing tag features.
To boost the representation, we make a "super tag" with POS, morphology and dependency label. The tagger predicts this
by adding a softmax layer onto the convolutional layer --- so, we're teaching the convolutional layer to give us a
representation that's one affine transform from this informative lexical information. This is obviously good for the
parser (which backprops to the convolutions too).

The parser model makes a state vector by concatenating the vector representations for its context tokens. Current
results suggest few context tokens works well. Maybe this is a bug.

The current context tokens:

* S0, S1, S2: Top three words on the stack
* B0, B1: First two words of the buffer
* S0L1, S0L2: Leftmost and second leftmost children of S0
* S0R1, S0R2: Rightmost and second rightmost children of S0
* S1L1, S1L2, S1R2, S1R, B0L1, B0L2: Likewise for S1 and B0

This makes the state vector quite long: 13*T, where T is the token vector width (128 is working well). Fortunately,
there's a way to structure the computation to save some expense (and make it more GPU friendly).

The parser typically visits 2*N states for a sentence of length N (although it may visit more, if it back-tracks
with a non-monotonic transition). A naive implementation would require 2*N (B, 13*T) @ (13*T, H) matrix multiplications
for a batch of size B. We can instead perform one (B*N, T) @ (T, 13*H) multiplication, to pre-compute the hidden
weights for each positional feature wrt the words in the batch. (Note that our token vectors come from the CNN
-- so we can't play this trick over the vocabulary. That's how Stanford's NN parser works --- and why its model
is so big.)

This pre-computation strategy allows a nice compromise between GPU-friendliness and implementation simplicity.
The CNN and the wide lower layer are computed on the GPU, and then the precomputed hidden weights are moved
to the CPU, before we start the transition-based parsing process. This makes a lot of things much easier.
We don't have to worry about variable-length batch sizes, and we don't have to implement the dynamic oracle
in CUDA to train.

Currently the parser's loss function is multilabel log loss, as the dynamic oracle allows multiple states to
be 0 cost. This is defined as:

(exp(score) / Z) - (exp(score) / gZ)

Where gZ is the sum of the scores assigned to gold classes. I'm very interested in regressing on the cost directly,
but so far this isn't working well.

Machinery is in place for beam-search, which has been working well for the linear model. Beam search should benefit
greatly from the pre-computation trick.

											
										
										
											2017-05-13 00:09:15 +03:00
+								        for gold in golds:
 								            for tag in gold.tags:
-												Remove state argument in pipeline. Other changes

											
										
										
											2017-05-19 21:26:36 +03:00
+								                if tag is None:
 								                    correct[idx] = guesses[idx]
-												Fix handling of unknown tags in tagger update

											
										
										
											2018-06-25 23:00:51 +03:00
+								                elif tag in tag_index:
-												Remove state argument in pipeline. Other changes

											
										
										
											2017-05-19 21:26:36 +03:00
+								                    correct[idx] = tag_index[tag]
-												Fix handling of unknown tags in tagger update

											
										
										
											2018-06-25 23:00:51 +03:00
+								                else:
-												Fix handling of unseen labels in tagger

											
										
										
											2018-06-25 23:24:54 +03:00
+								                    correct[idx] = 0
 								                    known_labels[idx] = 0.
-												Update draft of parser neural network model

Model is good, but code is messy. Currently requires Chainer, which may cause the build to fail on machines without a GPU.

Outline of the model:

We first predict context-sensitive vectors for each word in the input:

(embed_lower | embed_prefix | embed_suffix | embed_shape)
>> Maxout(token_width)
>> convolution ** 4

This convolutional layer is shared between the tagger and the parser. This prevents the parser from needing tag features.
To boost the representation, we make a "super tag" with POS, morphology and dependency label. The tagger predicts this
by adding a softmax layer onto the convolutional layer --- so, we're teaching the convolutional layer to give us a
representation that's one affine transform from this informative lexical information. This is obviously good for the
parser (which backprops to the convolutions too).

The parser model makes a state vector by concatenating the vector representations for its context tokens. Current
results suggest few context tokens works well. Maybe this is a bug.

The current context tokens:

* S0, S1, S2: Top three words on the stack
* B0, B1: First two words of the buffer
* S0L1, S0L2: Leftmost and second leftmost children of S0
* S0R1, S0R2: Rightmost and second rightmost children of S0
* S1L1, S1L2, S1R2, S1R, B0L1, B0L2: Likewise for S1 and B0

This makes the state vector quite long: 13*T, where T is the token vector width (128 is working well). Fortunately,
there's a way to structure the computation to save some expense (and make it more GPU friendly).

The parser typically visits 2*N states for a sentence of length N (although it may visit more, if it back-tracks
with a non-monotonic transition). A naive implementation would require 2*N (B, 13*T) @ (13*T, H) matrix multiplications
for a batch of size B. We can instead perform one (B*N, T) @ (T, 13*H) multiplication, to pre-compute the hidden
weights for each positional feature wrt the words in the batch. (Note that our token vectors come from the CNN
-- so we can't play this trick over the vocabulary. That's how Stanford's NN parser works --- and why its model
is so big.)

This pre-computation strategy allows a nice compromise between GPU-friendliness and implementation simplicity.
The CNN and the wide lower layer are computed on the GPU, and then the precomputed hidden weights are moved
to the CPU, before we start the transition-based parsing process. This makes a lot of things much easier.
We don't have to worry about variable-length batch sizes, and we don't have to implement the dynamic oracle
in CUDA to train.

Currently the parser's loss function is multilabel log loss, as the dynamic oracle allows multiple states to
be 0 cost. This is defined as:

(exp(score) / Z) - (exp(score) / gZ)

Where gZ is the sum of the scores assigned to gold classes. I'm very interested in regressing on the cost directly,
but so far this isn't working well.

Machinery is in place for beam-search, which has been working well for the linear model. Beam search should benefit
greatly from the pre-computation trick.

											
										
										
											2017-05-13 00:09:15 +03:00
+								                idx += 1
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        correct = self.model.ops.xp.array(correct, dtype="i")
-												Update draft of parser neural network model

Model is good, but code is messy. Currently requires Chainer, which may cause the build to fail on machines without a GPU.

Outline of the model:

We first predict context-sensitive vectors for each word in the input:

(embed_lower | embed_prefix | embed_suffix | embed_shape)
>> Maxout(token_width)
>> convolution ** 4

This convolutional layer is shared between the tagger and the parser. This prevents the parser from needing tag features.
To boost the representation, we make a "super tag" with POS, morphology and dependency label. The tagger predicts this
by adding a softmax layer onto the convolutional layer --- so, we're teaching the convolutional layer to give us a
representation that's one affine transform from this informative lexical information. This is obviously good for the
parser (which backprops to the convolutions too).

The parser model makes a state vector by concatenating the vector representations for its context tokens. Current
results suggest few context tokens works well. Maybe this is a bug.

The current context tokens:

* S0, S1, S2: Top three words on the stack
* B0, B1: First two words of the buffer
* S0L1, S0L2: Leftmost and second leftmost children of S0
* S0R1, S0R2: Rightmost and second rightmost children of S0
* S1L1, S1L2, S1R2, S1R, B0L1, B0L2: Likewise for S1 and B0

This makes the state vector quite long: 13*T, where T is the token vector width (128 is working well). Fortunately,
there's a way to structure the computation to save some expense (and make it more GPU friendly).

The parser typically visits 2*N states for a sentence of length N (although it may visit more, if it back-tracks
with a non-monotonic transition). A naive implementation would require 2*N (B, 13*T) @ (13*T, H) matrix multiplications
for a batch of size B. We can instead perform one (B*N, T) @ (T, 13*H) multiplication, to pre-compute the hidden
weights for each positional feature wrt the words in the batch. (Note that our token vectors come from the CNN
-- so we can't play this trick over the vocabulary. That's how Stanford's NN parser works --- and why its model
is so big.)

This pre-computation strategy allows a nice compromise between GPU-friendliness and implementation simplicity.
The CNN and the wide lower layer are computed on the GPU, and then the precomputed hidden weights are moved
to the CPU, before we start the transition-based parsing process. This makes a lot of things much easier.
We don't have to worry about variable-length batch sizes, and we don't have to implement the dynamic oracle
in CUDA to train.

Currently the parser's loss function is multilabel log loss, as the dynamic oracle allows multiple states to
be 0 cost. This is defined as:

(exp(score) / Z) - (exp(score) / gZ)

Where gZ is the sum of the scores assigned to gold classes. I'm very interested in regressing on the cost directly,
but so far this isn't working well.

Machinery is in place for beam-search, which has been working well for the linear model. Beam search should benefit
greatly from the pre-computation trick.

											
										
										
											2017-05-13 00:09:15 +03:00
+								        d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
-												Fix tagger

											
										
										
											2018-09-13 15:14:38 +03:00
+								        d_scores *= self.model.ops.asarray(known_labels)
-												Bug fixes to pipeline

											
										
										
											2017-05-18 12:29:51 +03:00
+								        loss = (d_scores**2).sum()
-												Pass tokvecs through as a list, instead of concatenated. Also fix padding

											
										
										
											2017-05-20 21:23:05 +03:00
+								        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
-												Fix use_params and pipe methods

											
										
										
											2017-05-18 16:30:59 +03:00
+								        return float(loss), d_scores
-												Add a pipeline module, to collect and wrap processes for annotation

											
										
										
											2016-10-16 02:47:12 +03:00
-												Pass data as a function in begin_training methods

											
										
										
											2018-03-27 12:39:59 +03:00
+								    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
-												Pass kwargs into pipeline components during begin_training

											
										
										
											2018-02-12 12:18:39 +03:00
+								                       **kwargs):
-												Warn in Tagger.begin_training if no lemma tables are available (#4351)


											
										
										
											2019-10-01 16:13:55 +03:00
+								        lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
 								        if not any(table in self.vocab.lookups for table in lemma_tables):
 								            user_warning(Warnings.W022)
-												Fix use_params and pipe methods

											
										
										
											2017-05-18 16:30:59 +03:00
+								        orig_tag_map = dict(self.vocab.morphology.tag_map)
-												Try to fix python3.5 serialization

											
										
										
											2017-11-08 14:10:49 +03:00
+								        new_tag_map = OrderedDict()
-												Pass data as a function in begin_training methods

											
										
										
											2018-03-27 12:39:59 +03:00
+								        for raw_text, annots_brackets in get_gold_tuples():
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model config parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add ROCAUCScore tests and improve errors/warnings

* Add tests for ROCAUCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
+								            _ = annots_brackets.pop()
-												Get spaCy train command working with neural network

* Integrate models into pipeline
* Add basic serialization (maybe incorrect)
* Fix pickle on vocab

											
										
										
											2017-05-17 13:04:50 +03:00
+								            for annots, brackets in annots_brackets:
 								                ids, words, tags, heads, deps, ents = annots
 								                for tag in tags:
-												Fix use_params and pipe methods

											
										
										
											2017-05-18 16:30:59 +03:00
+								                    if tag in orig_tag_map:
 								                        new_tag_map[tag] = orig_tag_map[tag]
 								                    else:
 								                        new_tag_map[tag] = {POS: X}
-												Get spaCy train command working with neural network

* Integrate models into pipeline
* Add basic serialization (maybe incorrect)
* Fix pickle on vocab

											
										
										
											2017-05-17 13:04:50 +03:00
+								        cdef Vocab vocab = self.vocab
-												Move weight serialization to Thinc

											
										
										
											2017-06-01 11:04:36 +03:00
+								        if new_tag_map:
 								            vocab.morphology = Morphology(vocab.strings, new_tag_map,
-												Fix loading of morphology exceptions

											
										
										
											2017-06-05 00:34:32 +03:00
+								                                          vocab.morphology.lemmatizer,
 								                                          exc=vocab.morphology.exc)
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors")
-												More serialization fixes. Still broken

											
										
										
											2017-05-29 21:23:47 +03:00
+								        if self.model is True:
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            for hp in ["token_vector_width", "conv_depth"]:
-												Pass tagger options in begin_training

											
										
										
											2018-12-18 02:08:31 +03:00
+								                if hp in kwargs:
 								                    self.cfg[hp] = kwargs[hp]
-												Fix serialization of model options

											
										
										
											2017-09-21 21:07:26 +03:00
+								            self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
-												Allow multi-task objectives during training

											
										
										
											2017-09-26 13:42:52 +03:00
+								        link_vectors_to_models(self.vocab)
-												Return optimizer from begin_training, creating if necessary

											
										
										
											2017-11-06 16:26:26 +03:00
+								        if sgd is None:
 								            sgd = self.create_optimizer()
 								        return sgd
-												More serialization fixes. Still broken

											
										
										
											2017-05-29 21:23:47 +03:00
 								    @classmethod
-												Fix tagger training

											
										
										
											2017-09-23 03:58:06 +03:00
+								    def Model(cls, n_tags, **cfg):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        if cfg.get("pretrained_dims") and not cfg.get("pretrained_vectors"):
-												Update errors

											
										
										
											2018-04-03 22:40:29 +03:00
+								            raise ValueError(TempErrors.T008)
-												Fix tagger training

											
										
										
											2017-09-23 03:58:06 +03:00
+								        return build_tagger_model(n_tags, **cfg)
-												Pass option for pretrained vectors in pipeline

											
										
										
											2017-09-16 20:46:02 +03:00
-												Allow Tagger.add_label() before training

											
										
										
											2017-11-01 23:49:24 +03:00
+								    def add_label(self, label, values=None):
-												Add add_label methods to Tagger and TextCategorizer

											
										
										
											2017-11-01 18:32:44 +03:00
+								        if label in self.labels:
 								            return 0
-												Allow Tagger.add_label() before training

											
										
										
											2017-11-01 23:49:24 +03:00
+								        if self.model not in (True, False, None):
 								            # Here's how the model resizing will work, once the
 								            # neuron-to-tag mapping is no longer controlled by
 								            # the Morphology class, which sorts the tag names.
 								            # The sorting makes adding labels difficult.
 								            # smaller = self.model._layers[-1]
 								            # larger = Softmax(len(self.labels)+1, smaller.nI)
 								            # copy_array(larger.W[:smaller.nO], smaller.W)
 								            # copy_array(larger.b[:smaller.nO], smaller.b)
 								            # self.model._layers[-1] = larger
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								            raise ValueError(TempErrors.T003)
-												Allow Tagger.add_label() before training

											
										
										
											2017-11-01 23:49:24 +03:00
+								        tag_map = dict(self.vocab.morphology.tag_map)
 								        if values is None:
 								            values = {POS: "X"}
 								        tag_map[label] = values
 								        self.vocab.morphology = Morphology(
 								            self.vocab.strings, tag_map=tag_map,
 								            lemmatizer=self.vocab.morphology.lemmatizer,
 								            exc=self.vocab.morphology.exc)
 								        return 1
-												Add add_label methods to Tagger and TextCategorizer

											
										
										
											2017-11-01 18:32:44 +03:00
-												Fix use_params and pipe methods

											
										
										
											2017-05-18 16:30:59 +03:00
+								    def use_params(self, params):
 								        with self.model.use_params(params):
 								            yield
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								    def to_bytes(self, exclude=tuple(), **kwargs):
-												Patch serialization bug raised in #1105

											
										
										
											2017-10-10 04:58:12 +03:00
+								        serialize = OrderedDict()
-												Fix Pipe.to_bytes() when model uninitialized

Closes #3289

											
										
										
											2019-02-21 11:42:02 +03:00
+								        if self.model not in (None, True, False):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            serialize["model"] = self.model.to_bytes
 								        serialize["vocab"] = self.vocab.to_bytes
 								        serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
-												Fix serialization

											
										
										
											2017-11-08 15:08:48 +03:00
+								        tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map)
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
-												Update to/from bytes methods

											
										
										
											2017-05-29 11:14:20 +03:00
+								        return util.to_bytes(serialize, exclude)
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								    def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
-												More serialization fixes. Still broken

											
										
										
											2017-05-29 21:23:47 +03:00
+								        def load_model(b):
-												Fix loading of multiple pre-trained vectors

This patch addresses #1660, which was caused by keying all pre-trained
vectors with the same ID when telling Thinc how to refer to them. This
meant that if multiple models were loaded that had pre-trained vectors,
errors or incorrect behaviour resulted.

The vectors class now includes a .name attribute, which defaults to:
{nlp.meta['lang']}_{nlp.meta['name']}.vectors
The vectors name is set in the cfg of the pipeline components under the
key pretrained_vectors. This replaces the previous cfg key
pretrained_dims.

In order to make existing models compatible with this change, we check
for the pretrained_dims key when loading models in from_disk and
from_bytes, and add the cfg key pretrained_vectors if we find it.

											
										
										
											2018-03-28 17:02:59 +03:00
+								            # TODO: Remove this once we don't have to handle previous models
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
 								                self.cfg["pretrained_vectors"] = self.vocab.vectors.name
-												More serialization fixes. Still broken

											
										
										
											2017-05-29 21:23:47 +03:00
+								            if self.model is True:
-												Tidy up pipeline

											
										
										
											2017-10-27 21:29:08 +03:00
+								                token_vector_width = util.env_opt(
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								                    "token_vector_width",
 								                    self.cfg.get("token_vector_width", 96))
 								                self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
-												💫 Improve error message when model.from_bytes() dies (#4014)

* Improve error message when model.from_bytes() dies

When Thinc's model.from_bytes() is called with a mismatched model, often
we get a particularly ungraceful error,

e.g. "AttributeError: FunctionLayer has no attribute G"

This is because we're trying to load the parameters for something like
a LayerNorm layer, and the model architecture has some other layer there
instead. This is obviously terrible, especially since the error *type*
is wrong.

I've changed it to raise a ValueError. The error message is still
probably a bit terse, but it's hard to be sure exactly what's gone
wrong.

* Update spacy/pipeline/pipes.pyx

* Update spacy/pipeline/pipes.pyx

* Update spacy/pipeline/pipes.pyx

* Update spacy/syntax/nn_parser.pyx

* Update spacy/syntax/nn_parser.pyx

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: Matthew Honnibal <honnibal+gh@gmail.com>

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: Matthew Honnibal <honnibal+gh@gmail.com>


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-07-24 12:27:34 +03:00
+								            try:
 								                self.model.from_bytes(b)
 								            except AttributeError:
 								                raise ValueError(Errors.E149)
-												Fix serialization of tag_map in NeuralTagger

											
										
										
											2017-06-02 18:18:37 +03:00
 								        def load_tag_map(b):
-												💫 Replace ujson, msgpack and dill/pickle/cloudpickle with srsly (#3003)

Remove hacks and wrappers, keep code in sync across our libraries and move spaCy a few steps closer to only depending on packages with binary wheels 🎉

See here: https://github.com/explosion/srsly

    Serialization is hard, especially across Python versions and multiple platforms. After dealing with many subtle bugs over the years (encodings, locales, large files) our libraries like spaCy and Prodigy have steadily grown a number of utility functions to wrap the multiple serialization formats we need to support (especially json, msgpack and pickle). These wrapping functions ended up duplicated across our codebases, so we wanted to put them in one place.

    At the same time, we noticed that having a lot of small dependencies was making maintenance harder, and making installation slower. To solve this, we've made srsly standalone, by including the component packages directly within it. This way we can provide all the serialization utilities we need in a single binary wheel.

    srsly currently includes forks of the following packages:

        ujson
        msgpack
        msgpack-numpy
        cloudpickle



* WIP: replace json/ujson with srsly

* Replace ujson in examples

Use regular json instead of srsly to make code easier to read and follow

* Update requirements

* Fix imports

* Fix typos

* Replace msgpack with srsly

* Fix warning

											
										
										
											2018-12-03 03:28:22 +03:00
+								            tag_map = srsly.msgpack_loads(b)
-												Fix serialization of tag_map in NeuralTagger

											
										
										
											2017-06-02 18:18:37 +03:00
+								            self.vocab.morphology = Morphology(
 								                self.vocab.strings, tag_map=tag_map,
-												Fix loading of morphology exceptions

											
										
										
											2017-06-05 00:34:32 +03:00
+								                lemmatizer=self.vocab.morphology.lemmatizer,
 								                exc=self.vocab.morphology.exc)
-												Pass option for pretrained vectors in pipeline

											
										
										
											2017-09-16 20:46:02 +03:00
-												Serialize in consistent order

											
										
										
											2017-05-30 01:53:06 +03:00
+								        deserialize = OrderedDict((
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            ("vocab", lambda b: self.vocab.from_bytes(b)),
 								            ("tag_map", load_tag_map),
 								            ("cfg", lambda b: self.cfg.update(srsly.json_loads(b))),
 								            ("model", lambda b: load_model(b)),
-												Serialize in consistent order

											
										
										
											2017-05-30 01:53:06 +03:00
+								        ))
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
-												More serialization fixes. Still broken

											
										
										
											2017-05-29 21:23:47 +03:00
+								        util.from_bytes(bytes_data, deserialize, exclude)
-												Update to/from bytes methods

											
										
										
											2017-05-29 11:14:20 +03:00
+								        return self
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								    def to_disk(self, path, exclude=tuple(), **kwargs):
-												Fix serialization

											
										
										
											2017-11-08 15:08:48 +03:00
+								        tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
-												Fix serialization for tagger when tag_map has changed

											
										
										
											2017-06-01 20:18:36 +03:00
+								        serialize = OrderedDict((
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								            ("vocab", lambda p: self.vocab.to_disk(p)),
 								            ("tag_map", lambda p: srsly.write_msgpack(p, tag_map)),
 								            ("model", lambda p: p.open("wb").write(self.model.to_bytes())),
 								            ("cfg", lambda p: srsly.write_json(p, self.cfg))
-												Fix serialization for tagger when tag_map has changed

											
										
										
											2017-06-01 20:18:36 +03:00
+								        ))
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
-												Work on to/from bytes/disk serialization methods

											
										
										
											2017-05-29 12:45:45 +03:00
+								        util.to_disk(path, serialize, exclude)
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								    def from_disk(self, path, exclude=tuple(), **kwargs):
-												Fix serialization for tagger when tag_map has changed

											
										
										
											2017-06-01 20:18:36 +03:00
+								        def load_model(p):
-												Fix loading of multiple pre-trained vectors

This patch addresses #1660, which was caused by keying all pre-trained
vectors with the same ID when telling Thinc how to refer to them. This
meant that if multiple models were loaded that had pre-trained vectors,
errors or incorrect behaviour resulted.

The vectors class now includes a .name attribute, which defaults to:
{nlp.meta['lang']}_{nlp.meta['name']}.vectors
The vectors name is set in the cfg of the pipeline components under the
key pretrained_vectors. This replaces the previous cfg key
pretrained_dims.

In order to make existing models compatible with this change, we check
for the pretrained_dims key when loading models in from_disk and
from_bytes, and add the cfg key pretrained_vectors if we find it.

											
										
										
											2018-03-28 17:02:59 +03:00
+								            # TODO: Remove this once we don't have to handle previous models
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
 								                self.cfg["pretrained_vectors"] = self.vocab.vectors.name
-												Fix serialization for tagger when tag_map has changed

											
										
										
											2017-06-01 20:18:36 +03:00
+								            if self.model is True:
-												Fix serialization of model options

											
										
										
											2017-09-21 21:07:26 +03:00
+								                self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            with p.open("rb") as file_:
-												💫 Improve error message when model.from_bytes() dies (#4014)

* Improve error message when model.from_bytes() dies

When Thinc's model.from_bytes() is called with a mismatched model, often
we get a particularly ungraceful error,

e.g. "AttributeError: FunctionLayer has no attribute G"

This is because we're trying to load the parameters for something like
a LayerNorm layer, and the model architecture has some other layer there
instead. This is obviously terrible, especially since the error *type*
is wrong.

I've changed it to raise a ValueError. The error message is still
probably a bit terse, but it's hard to be sure exactly what's gone
wrong.

* Update spacy/pipeline/pipes.pyx

* Update spacy/pipeline/pipes.pyx

* Update spacy/pipeline/pipes.pyx

* Update spacy/syntax/nn_parser.pyx

* Update spacy/syntax/nn_parser.pyx

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: Matthew Honnibal <honnibal+gh@gmail.com>

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: Matthew Honnibal <honnibal+gh@gmail.com>


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-07-24 12:27:34 +03:00
+								                try:
 								                    self.model.from_bytes(file_.read())
 								                except AttributeError:
 								                    raise ValueError(Errors.E149)
-												Fix serialization for tagger when tag_map has changed

											
										
										
											2017-06-01 20:18:36 +03:00
 								        def load_tag_map(p):
-												💫 Replace ujson, msgpack and dill/pickle/cloudpickle with srsly (#3003)

Remove hacks and wrappers, keep code in sync across our libraries and move spaCy a few steps closer to only depending on packages with binary wheels 🎉

See here: https://github.com/explosion/srsly

    Serialization is hard, especially across Python versions and multiple platforms. After dealing with many subtle bugs over the years (encodings, locales, large files) our libraries like spaCy and Prodigy have steadily grown a number of utility functions to wrap the multiple serialization formats we need to support (especially json, msgpack and pickle). These wrapping functions ended up duplicated across our codebases, so we wanted to put them in one place.

    At the same time, we noticed that having a lot of small dependencies was making maintenance harder, and making installation slower. To solve this, we've made srsly standalone, by including the component packages directly within it. This way we can provide all the serialization utilities we need in a single binary wheel.

    srsly currently includes forks of the following packages:

        ujson
        msgpack
        msgpack-numpy
        cloudpickle



* WIP: replace json/ujson with srsly

* Replace ujson in examples

Use regular json instead of srsly to make code easier to read and follow

* Update requirements

* Fix imports

* Fix typos

* Replace msgpack with srsly

* Fix warning

											
										
										
											2018-12-03 03:28:22 +03:00
+								            tag_map = srsly.read_msgpack(p)
-												Fix serialization for tagger when tag_map has changed

											
										
										
											2017-06-01 20:18:36 +03:00
+								            self.vocab.morphology = Morphology(
 								                self.vocab.strings, tag_map=tag_map,
-												Fix loading of morphology exceptions

											
										
										
											2017-06-05 00:34:32 +03:00
+								                lemmatizer=self.vocab.morphology.lemmatizer,
 								                exc=self.vocab.morphology.exc)
-												Fix serialization for tagger when tag_map has changed

											
										
										
											2017-06-01 20:18:36 +03:00
 								        deserialize = OrderedDict((
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            ("cfg", lambda p: self.cfg.update(_load_cfg(p))),
 								            ("vocab", lambda p: self.vocab.from_disk(p)),
 								            ("tag_map", load_tag_map),
 								            ("model", load_model),
-												Fix serialization for tagger when tag_map has changed

											
										
										
											2017-06-01 20:18:36 +03:00
+								        ))
-												💫 Make serialization methods consistent (#3385)

* Make serialization methods consistent

exclude keyword argument instead of random named keyword arguments and deprecation handling

* Update docs and add section on serialization fields

											
										
										
											2019-03-10 21:16:45 +03:00
+								        exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
-												Work on to/from bytes/disk serialization methods

											
										
										
											2017-05-29 12:45:45 +03:00
+								        util.from_disk(path, deserialize, exclude)
 								        return self
-												Update to/from bytes methods

											
										
										
											2017-05-29 11:14:20 +03:00
-												Fix names of pipeline components

NeuralDependencyParser --> DependencyParser
NeuralEntityRecognizer --> EntityRecognizer
TokenVectorEncoder     --> Tensorizer
NeuralLabeller         --> MultitaskObjective

											
										
										
											2017-10-26 13:38:23 +03:00
+								class MultitaskObjective(Tagger):
-												Tidy up pipeline

											
										
										
											2017-10-27 21:29:08 +03:00
+								    """Experimental: Assist training of a parser or tagger, by training a
 								    side-objective.
 								    """
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
 								    name = "nn_labeller"
-												Tidy up pipeline

											
										
										
											2017-10-27 21:29:08 +03:00
-												Allow multi-task objectives during training

											
										
										
											2017-09-26 13:42:52 +03:00
+								    def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
-												Add experimental NeuralLabeller

											
										
										
											2017-05-22 01:52:30 +03:00
+								        self.vocab = vocab
 								        self.model = model
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        if target == "dep":
-												Allow multi-task objectives during training

											
										
										
											2017-09-26 13:42:52 +03:00
+								            self.make_label = self.make_dep
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        elif target == "tag":
-												Allow multi-task objectives during training

											
										
										
											2017-09-26 13:42:52 +03:00
+								            self.make_label = self.make_tag
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        elif target == "ent":
-												Allow multi-task objectives during training

											
										
										
											2017-09-26 13:42:52 +03:00
+								            self.make_label = self.make_ent
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        elif target == "dep_tag_offset":
-												Allow multi-task objectives during training

											
										
										
											2017-09-26 13:42:52 +03:00
+								            self.make_label = self.make_dep_tag_offset
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        elif target == "ent_tag":
-												Allow multi-task objectives during training

											
										
										
											2017-09-26 13:42:52 +03:00
+								            self.make_label = self.make_ent_tag
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        elif target == "sent_start":
-												Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop"

This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing
changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df.

											
										
										
											2018-03-27 20:23:02 +03:00
+								            self.make_label = self.make_sent_start
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        elif hasattr(target, "__call__"):
-												Allow multi-task objectives during training

											
										
										
											2017-09-26 13:42:52 +03:00
+								            self.make_label = target
 								        else:
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								            raise ValueError(Errors.E016)
-												Add cfg attr to pipeline components

											
										
										
											2017-07-23 01:52:47 +03:00
+								        self.cfg = dict(cfg)
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        self.cfg.setdefault("cnn_maxout_pieces", 2)
-												Add cfg attr to pipeline components

											
										
										
											2017-07-23 01:52:47 +03:00
 								    @property
 								    def labels(self):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        return self.cfg.setdefault("labels", {})
-												Add cfg attr to pipeline components

											
										
										
											2017-07-23 01:52:47 +03:00
 								    @labels.setter
 								    def labels(self, value):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        self.cfg["labels"] = value
-												Add experimental NeuralLabeller

											
										
										
											2017-05-22 01:52:30 +03:00
-												Set Doc.tensor from Tagger

											
										
										
											2017-11-03 13:20:05 +03:00
+								    def set_annotations(self, docs, dep_ids, tensors=None):
-												Add experimental NeuralLabeller

											
										
										
											2017-05-22 01:52:30 +03:00
+								        pass
-												Pass data as a function in begin_training methods

											
										
										
											2018-03-27 12:39:59 +03:00
+								    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, tok2vec=None,
-												Pass kwargs into pipeline components during begin_training

											
										
										
											2018-02-12 12:18:39 +03:00
+								                       sgd=None, **kwargs):
-												Pass data as a function in begin_training methods

											
										
										
											2018-03-27 12:39:59 +03:00
+								        gold_tuples = nonproj.preprocess_training_data(get_gold_tuples())
-												Add experimental NeuralLabeller

											
										
										
											2017-05-22 01:52:30 +03:00
+								        for raw_text, annots_brackets in gold_tuples:
 								            for annots, brackets in annots_brackets:
 								                ids, words, tags, heads, deps, ents = annots
-												Allow multi-task objectives during training

											
										
										
											2017-09-26 13:42:52 +03:00
+								                for i in range(len(ids)):
 								                    label = self.make_label(i, words, tags, heads, deps, ents)
 								                    if label is not None and label not in self.labels:
 								                        self.labels[label] = len(self.labels)
-												More serialization fixes. Still broken

											
										
										
											2017-05-29 21:23:47 +03:00
+								        if self.model is True:
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            token_vector_width = util.env_opt("token_vector_width")
-												Fix MultitaskObjective

											
										
										
											2018-01-21 21:21:34 +03:00
+								            self.model = self.Model(len(self.labels), tok2vec=tok2vec)
-												Allow multi-task objectives during training

											
										
										
											2017-09-26 13:42:52 +03:00
+								        link_vectors_to_models(self.vocab)
-												Return optimizer from begin_training, creating if necessary

											
										
										
											2017-11-06 16:26:26 +03:00
+								        if sgd is None:
 								            sgd = self.create_optimizer()
 								        return sgd
-												More serialization fixes. Still broken

											
										
										
											2017-05-29 21:23:47 +03:00
 								    @classmethod
-												Allow multi-task objectives during training

											
										
										
											2017-09-26 13:42:52 +03:00
+								    def Model(cls, n_tags, tok2vec=None, **cfg):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        token_vector_width = util.env_opt("token_vector_width", 96)
-												Improve parser multi-task objective

											
										
										
											2018-12-01 16:41:24 +03:00
+								        softmax = Softmax(n_tags, token_vector_width*2)
-												Fix MultitaskObjective

											
										
										
											2018-01-21 21:21:34 +03:00
+								        model = chain(
 								            tok2vec,
-												Improve parser multi-task objective

											
										
										
											2018-12-01 16:41:24 +03:00
+								            LayerNorm(Maxout(token_vector_width*2, token_vector_width, pieces=3)),
-												Fix MultitaskObjective

											
										
										
											2018-01-21 21:21:34 +03:00
+								            softmax
 								        )
 								        model.tok2vec = tok2vec
 								        model.softmax = softmax
 								        return model
 								    def predict(self, docs):
-												💫 Raise better error when using uninitialized pipeline component (#3074)

After creating a component, the `.model` attribute is left with the value `True`, to indicate it should be created later during `from_disk()`, `from_bytes()` or `begin_training()`. This led to confusing errors if you tried to use the component without initializing the model.

To fix this, we add a method `require_model()` to the `Pipe` base class. The `require_model()` method needs to be called at the start of the `.predict()` and `.update()` methods of the components. It raises a `ValueError` if the model is not initialized. An error message has been added to `spacy.errors`.
											
										
										
											2018-12-20 17:54:53 +03:00
+								        self.require_model()
-												Fix MultitaskObjective

											
										
										
											2018-01-21 21:21:34 +03:00
+								        tokvecs = self.model.tok2vec(docs)
 								        scores = self.model.softmax(tokvecs)
 								        return tokvecs, scores
-												Pass option for pretrained vectors in pipeline

											
										
										
											2017-09-16 20:46:02 +03:00
-												Add experimental NeuralLabeller

											
										
										
											2017-05-22 01:52:30 +03:00
+								    def get_loss(self, docs, golds, scores):
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								        if len(docs) != len(golds):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            raise ValueError(Errors.E077.format(value="loss", n_docs=len(docs),
-												💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None

											
										
										
											2018-04-03 16:50:31 +03:00
+								                                                n_golds=len(golds)))
-												Add experimental NeuralLabeller

											
										
										
											2017-05-22 01:52:30 +03:00
+								        cdef int idx = 0
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        correct = numpy.zeros((scores.shape[0],), dtype="i")
-												Add experimental NeuralLabeller

											
										
										
											2017-05-22 01:52:30 +03:00
+								        guesses = scores.argmax(axis=1)
-												Fix multitask objectives

											
										
										
											2018-02-17 20:41:18 +03:00
+								        for i, gold in enumerate(golds):
 								            for j in range(len(docs[i])):
 								                # Handles alignment for tokenization differences
-												Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop"

This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing
changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df.

											
										
										
											2018-03-27 20:23:02 +03:00
+								                label = self.make_label(j, gold.words, gold.tags,
-												Fix multitask objectives

											
										
										
											2018-02-17 20:41:18 +03:00
+								                                        gold.heads, gold.labels, gold.ents)
-												Allow multi-task objectives during training

											
										
										
											2017-09-26 13:42:52 +03:00
+								                if label is None or label not in self.labels:
-												Add experimental NeuralLabeller

											
										
										
											2017-05-22 01:52:30 +03:00
+								                    correct[idx] = guesses[idx]
 								                else:
-												Allow multi-task objectives during training

											
										
										
											2017-09-26 13:42:52 +03:00
+								                    correct[idx] = self.labels[label]
-												Add experimental NeuralLabeller

											
										
										
											2017-05-22 01:52:30 +03:00
+								                idx += 1
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        correct = self.model.ops.xp.array(correct, dtype="i")
-												Add experimental NeuralLabeller

											
										
										
											2017-05-22 01:52:30 +03:00
+								        d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
 								        loss = (d_scores**2).sum()
 								        return float(loss), d_scores
-												Allow multi-task objectives during training

											
										
										
											2017-09-26 13:42:52 +03:00
+								    @staticmethod
 								    def make_dep(i, words, tags, heads, deps, ents):
 								        """Return the dependency label of token `i` as the training target.
 								        The result is None when the token's label or its head is unset.
 								        """
 								        dep_label = deps[i]
 								        # The head is only inspected once the label is known to be set.
 								        if dep_label is not None and heads[i] is not None:
 								            return dep_label
 								        return None
 								    @staticmethod
 								    def make_tag(i, words, tags, heads, deps, ents):
 								        """Return the tag of token `i` as the training target."""
 								        tag = tags[i]
 								        return tag
 								    @staticmethod
 								    def make_ent(i, words, tags, heads, deps, ents):
 								        """Return the entity annotation of token `i` as the training target.
 								        The result is None when no entity annotations are supplied at all.
 								        """
 								        return None if ents is None else ents[i]
 								    @staticmethod
 								    def make_dep_tag_offset(i, words, tags, heads, deps, ents):
 								        if deps[i] is None or heads[i] is None:
 								            return None
 								        offset = heads[i] - i
 								        offset = min(offset, 2)
 								        offset = max(offset, -2)
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        return "%s-%s:%d" % (deps[i], tags[i], offset)
-												Allow multi-task objectives during training

											
										
										
											2017-09-26 13:42:52 +03:00
 								    @staticmethod
 								    def make_ent_tag(i, words, tags, heads, deps, ents):
 								        if ents is None or ents[i] is None:
 								            return None
 								        else:
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            return "%s-%s" % (tags[i], ents[i])
-												Allow multi-task objectives during training

											
										
										
											2017-09-26 13:42:52 +03:00
-												Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop"

This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing
changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df.

											
										
										
											2018-03-27 20:23:02 +03:00
+								    @staticmethod
 								    def make_sent_start(target, words, tags, heads, deps, ents, cache=True, _cache={}):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """A multi-task objective for representing sentence boundaries,
-												Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop"

This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing
changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df.

											
										
										
											2018-03-27 20:23:02 +03:00
+								        using BILU scheme. (O is impossible)
 								        The implementation of this method uses an internal cache that relies
 								        on the identity of the heads array, to avoid requiring a new piece
 								        of gold data. You can pass cache=False if you know the cache will
 								        do the wrong thing.
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        """
-												Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop"

This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing
changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df.

											
										
										
											2018-03-27 20:23:02 +03:00
+								        assert len(words) == len(heads)
 								        assert target < len(words), (target, len(words))
 								        if cache:
 								            if id(heads) in _cache:
 								                return _cache[id(heads)][target]
 								            else:
 								                for key in list(_cache.keys()):
 								                    _cache.pop(key)
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            sent_tags = ["I-SENT"] * len(words)
-												Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop"

This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing
changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df.

											
										
										
											2018-03-27 20:23:02 +03:00
+								            _cache[id(heads)] = sent_tags
 								        else:
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            sent_tags = ["I-SENT"] * len(words)
-												Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop"

This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing
changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df.

											
										
										
											2018-03-27 20:23:02 +03:00
 								        def _find_root(child):
 								            seen = set([child])
 								            while child is not None and heads[child] != child:
 								                seen.add(child)
 								                child = heads[child]
 								            return child
 								        sentences = {}
 								        for i in range(len(words)):
 								            root = _find_root(i)
 								            if root is None:
 								                sent_tags[i] = None
 								            else:
 								                sentences.setdefault(root, []).append(i)
 								        for root, span in sorted(sentences.items()):
 								            if len(span) == 1:
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								                sent_tags[span[0]] = "U-SENT"
-												Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop"

This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing
changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df.

											
										
										
											2018-03-27 20:23:02 +03:00
+								            else:
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								                sent_tags[span[0]] = "B-SENT"
 								                sent_tags[span[-1]] = "L-SENT"
-												Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop"

This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing
changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df.

											
										
										
											2018-03-27 20:23:02 +03:00
+								        return sent_tags[target]
-												Get spaCy train command working with neural network

* Integrate models into pipeline
* Add basic serialization (maybe incorrect)
* Fix pickle on vocab

											
										
										
											2017-05-17 13:04:50 +03:00
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-text ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also give us a good place to implement the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
+								class ClozeMultitask(Pipe):
 								    @classmethod
 								    def Model(cls, vocab, tok2vec, **cfg):
-												Use chars loss in ClozeMultitask

											
										
										
											2019-10-20 18:47:15 +03:00
+								        if cfg["objective"] == "characters":
-												Fix number characters

											
										
										
											2019-10-24 18:34:16 +03:00
+								            out_sizes = [256] * cfg.get("nr_char", 4)
-												Use chars loss in ClozeMultitask

											
										
										
											2019-10-20 18:47:15 +03:00
+								            output_layer = MultiSoftmax(out_sizes)
 								        else:
 								            output_size = vocab.vectors.data.shape[1]
 								            output_layer = chain(
 								                LayerNorm(Maxout(output_size, tok2vec.nO, pieces=3)),
 								                zero_init(Affine(output_size, output_size, drop_factor=0.0))
 								            )
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-text ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also give us a good place to implement the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
+								        model = chain(tok2vec, output_layer)
 								        model = masked_language_model(vocab, model)
 								        model.tok2vec = tok2vec
 								        model.output_layer = output_layer
 								        return model
 								    def __init__(self, vocab, model=True, **cfg):
 								        self.vocab = vocab
 								        self.model = model
 								        self.cfg = cfg
-												Use chars loss in ClozeMultitask

											
										
										
											2019-10-20 18:47:15 +03:00
+								        self.cfg.setdefault("objective", "characters")
-												Fix number characters

											
										
										
											2019-10-24 18:34:16 +03:00
+								        self.cfg.setdefault("nr_char", 4)
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-text ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also give us a good place to implement the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
 								    def set_annotations(self, docs, dep_ids, tensors=None):
 								        pass
 								    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None,
 								                        tok2vec=None, sgd=None, **kwargs):
 								        link_vectors_to_models(self.vocab)
 								        if self.model is True:
-												Use chars loss in ClozeMultitask

											
										
										
											2019-10-20 18:47:15 +03:00
+								            kwargs.update(self.cfg)
 								            self.model = self.Model(self.vocab, tok2vec, **kwargs)
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-text ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also give us a good place to implement the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
+								        X = self.model.ops.allocate((5, self.model.tok2vec.nO))
 								        self.model.output_layer.begin_training(X)
 								        if sgd is None:
 								            sgd = self.create_optimizer()
 								        return sgd
 								    def predict(self, docs):
-												💫 Raise better error when using uninitialized pipeline component (#3074)

After creating a component, the `.model` attribute is left with the value `True`, to indicate it should be created later during `from_disk()`, `from_bytes()` or `begin_training()`. This has led to confusing errors if you try to use the component without initializing the model.

To fix this, we add a method `require_model()` to the `Pipe` base class. The `require_model()` method needs to be called at the start of the `.predict()` and `.update()` methods of the components. It raises a `ValueError` if the model is not initialized. An error message has been added to `spacy.errors`.
											
										
										
											2018-12-20 17:54:53 +03:00
+								        self.require_model()
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-text ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also give us a good place to implement the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
+								        tokvecs = self.model.tok2vec(docs)
 								        vectors = self.model.output_layer(tokvecs)
 								        return tokvecs, vectors
 								    def get_loss(self, docs, vectors, prediction):
-												Use chars loss in ClozeMultitask

											
										
										
											2019-10-20 18:47:15 +03:00
+								        if self.cfg["objective"] == "characters":
 								            loss, gradient = get_characters_loss(self.model.ops, docs, prediction)
 								        else:
 								            # The simplest way to implement this would be to vstack the
 								            # token.vector values, but that's a bit inefficient, especially on GPU.
 								            # Instead we fetch the index into the vectors table for each of our tokens,
 								            # and look them up all at once. This prevents data copying.
 								            ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
 								            target = vectors[ids]
 								            loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-text ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also give us a good place to implement the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
+								        return float(loss), gradient
-												Fix whitespace

											
										
										
											2019-02-05 14:32:20 +03:00
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-text ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also give us a good place to implement the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
+								    def update(self, docs, golds, drop=0., sgd=None, losses=None):
 								        pass
 								    def rehearse(self, docs, drop=0., sgd=None, losses=None):
-												💫 Raise better error when using uninitialized pipeline component (#3074)

After creating a component, the `.model` attribute is left with the value `True`, to indicate it should be created later during `from_disk()`, `from_bytes()` or `begin_training()`. This has led to confusing errors if you try to use the component without initializing the model.

To fix this, we add a method `require_model()` to the `Pipe` base class. The `require_model()` method needs to be called at the start of the `.predict()` and `.update()` methods of the components. It raises a `ValueError` if the model is not initialized. An error message has been added to `spacy.errors`.
											
										
										
											2018-12-20 17:54:53 +03:00
+								        self.require_model()
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-text ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also give us a good place to implement the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categoriz
											
										
										
											2018-12-10 18:25:33 +03:00
+								        if losses is not None and self.name not in losses:
 								            losses[self.name] = 0.
 								        predictions, bp_predictions = self.model.begin_update(docs, drop=drop)
 								        loss, d_predictions = self.get_loss(docs, self.vocab.vectors.data, predictions)
 								        bp_predictions(d_predictions, sgd=sgd)
 								        if losses is not None:
 								            losses[self.name] += loss
-												Add method to decode predicted characters

											
										
										
											2019-10-21 04:56:15 +03:00
+								    @staticmethod
 								    def decode_utf8_predictions(char_array):
 								        # The format alternates filling from start and end, and 255 is missing
 								        words = []
 								        char_array = char_array.reshape((char_array.shape[0], -1, 256))
 								        nr_char = char_array.shape[1]
 								        char_array = char_array.argmax(axis=-1)
 								        for row in char_array:
 								            starts = [chr(c) for c in row[::2] if c != 255]
 								            ends = [chr(c) for c in row[1::2] if c != 255]
 								            word = "".join(starts + list(reversed(ends)))
 								            words.append(word)
 								        return words
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-text ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also give us a good place to implement the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categorizer
											
										
										
											2018-12-10 18:25:33 +03:00
-												Rename BaseThincComponent --> Pipe

											
										
										
											2017-10-26 13:40:40 +03:00
+								class TextCategorizer(Pipe):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								    """Pipeline component for text classification.
 								    DOCS: https://spacy.io/api/textcategorizer
 								    """
-												Fix name of TextCategorizer

											
										
										
											2017-07-22 02:14:07 +03:00
+								    name = 'textcat'
-												Add experimental SimilarityHook component

											
										
										
											2017-06-05 16:40:03 +03:00
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								    @classmethod
-												Fix default argument in TextCategorizer.Model (resolves #3221)

											
										
										
											2019-02-05 14:33:47 +03:00
+								    def Model(cls, nr_class=1, **cfg):
-												💫 Make TextCategorizer default to a simpler, GPU-friendly model (#3038)

Currently the TextCategorizer defaults to a fairly complicated model, designed partly around the active learning requirements of Prodigy. The model's a bit slow, and not very GPU-friendly.

This patch implements a straightforward CNN model that still performs pretty well. The replacement model also makes it easy to use the LMAO pretraining, since most of the parameters are in the CNN.

The replacement model has a flag to specify whether labels are mutually exclusive, which defaults to True. This has been a common problem with the text classifier. We'll also now be able to support adding labels to pretrained models again.

Resolves #2934, #2756, #1798, #1748.
											
										
										
											2018-12-10 16:37:39 +03:00
+								        embed_size = util.env_opt("embed_size", 2000)
 								        if "token_vector_width" in cfg:
 								            token_vector_width = cfg["token_vector_width"]
 								        else:
 								            token_vector_width = util.env_opt("token_vector_width", 96)
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        if cfg.get("architecture") == "simple_cnn":
-												Default to former TextCategorizer model

* Keep TextCategorizer default model same as v2.0
* Add option 'architecture' that allows "simple_cnn" to switch to
simpler model.
* Add option exclusive_classes, defaulting to False. If set to True,
the model treats classes as mutually exclusive, i.e. only one class can
be true per instance.

											
										
										
											2019-02-23 13:55:16 +03:00
+								            tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg)
 								            return build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg)
-												Bug fixes and options for TextCategorizer (#3472)

* Fix code for bag-of-words feature extraction

The _ml.py module had a redundant copy of a function to extract unigram
bag-of-words features, except one had a bug that set values to 0.
Another function allowed extraction of bigram features. Replace all three
with a new function that supports arbitrary ngram sizes and also allows
control of which attribute is used (e.g. ORTH, LOWER, etc).

* Support 'bow' architecture for TextCategorizer

This allows efficient ngram bag-of-words models, which are better when
the classifier needs to run quickly, especially when the texts are long.
Pass architecture="bow" to use it. The extra arguments ngram_size and
attr are also available, e.g. ngram_size=2 means unigram and bigram
features will be extracted.

* Fix size limits in train_textcat example

* Explain architectures better in docs

											
										
										
											2019-03-23 18:44:44 +03:00
+								        elif cfg.get("architecture") == "bow":
 								            return build_bow_text_classifier(nr_class, **cfg)
-												Default to former TextCategorizer model

* Keep TextCategorizer default model same as v2.0
* Add option 'architecture' that allows "simple_cnn" to switch to
simpler model.
* Add option exclusive_classes, defaulting to False. If set to True,
the model treats classes as mutually exclusive, i.e. only one class can
be true per instance.

											
										
										
											2019-02-23 13:55:16 +03:00
+								        else:
 								            return build_text_classifier(nr_class, **cfg)
-												Add experimental SimilarityHook component

											
										
										
											2017-06-05 16:40:03 +03:00
-												Fix tensorizer

											
										
										
											2018-11-03 01:51:37 +03:00
+								    @property
 								    def tok2vec(self):
 								        if self.model in (None, True, False):
 								            return None
 								        else:
-												💫 Make TextCategorizer default to a simpler, GPU-friendly model (#3038)

Currently the TextCategorizer defaults to a fairly complicated model, designed partly around the active learning requirements of Prodigy. The model's a bit slow, and not very GPU-friendly.

This patch implements a straightforward CNN model that still performs pretty well. The replacement model also makes it easy to use the LMAO pretraining, since most of the parameters are in the CNN.

The replacement model has a flag to specify whether labels are mutually exclusive, which defaults to True. This has been a common problem with the text classifier. We'll also now be able to support adding labels to pretrained models again.

Resolves #2934, #2756, #1798, #1748.
											
										
										
											2018-12-10 16:37:39 +03:00
+								            return self.model.tok2vec
-												Fix tensorizer

											
										
										
											2018-11-03 01:51:37 +03:00
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								    def __init__(self, vocab, model=True, **cfg):
 								        self.vocab = vocab
 								        self.model = model
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-text ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also give us a good place to implement the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categorizer
											
										
										
											2018-12-10 18:25:33 +03:00
+								        self._rehearsal_model = None
-												Add cfg attr to pipeline components

											
										
										
											2017-07-23 01:52:47 +03:00
+								        self.cfg = dict(cfg)
-												Fix save/load of textcat config

											
										
										
											2017-07-23 01:33:43 +03:00
 								    @property
 								    def labels(self):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        return tuple(self.cfg.setdefault("labels", []))
-												Fix save/load of textcat config

											
										
										
											2017-07-23 01:33:43 +03:00
-												more friendly textcat errors (#3946)

* more friendly textcat errors with require_model and require_labels

* update thinc version with recent bugfix

											
										
										
											2019-07-10 20:39:38 +03:00
+								    def require_labels(self):
 								        """Raise an error if the component's model has no labels defined."""
 								        if not self.labels:
 								            raise ValueError(Errors.E143.format(name=self.name))
-												Fix save/load of textcat config

											
										
										
											2017-07-23 01:33:43 +03:00
+								    @labels.setter
 								    def labels(self, value):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        self.cfg["labels"] = tuple(value)
-												Add experimental SimilarityHook component

											
										
										
											2017-06-05 16:40:03 +03:00
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								    def pipe(self, stream, batch_size=128, n_threads=-1):
-												Remove cytoolz usage from spaCy

											
										
										
											2018-12-03 04:19:12 +03:00
+								        for docs in util.minibatch(stream, size=batch_size):
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								            docs = list(docs)
-												Set Doc.tensor from Tagger

											
										
										
											2017-11-03 13:20:05 +03:00
+								            scores, tensors = self.predict(docs)
 								            self.set_annotations(docs, scores, tensors=tensors)
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								            yield from docs
 								    def predict(self, docs):
-												💫 Raise better error when using uninitialized pipeline component (#3074)

After creating a component, the `.model` attribute is left with the value `True`, to indicate it should be created later during `from_disk()`, `from_bytes()` or `begin_training()`. This had led to confusing errors if you try to use the component without initializing the model.

To fix this, we add a method `require_model()` to the `Pipe` base class. The `require_model()` method needs to be called at the start of the `.predict()` and `.update()` methods of the components. It raises a `ValueError` if the model is not initialized. An error message has been added to `spacy.errors`.
											
										
										
											2018-12-20 17:54:53 +03:00
+								        self.require_model()
-												Resolve edge case when calling textcat.predict with empty doc (#4035)

* resolve edge case where no doc has tokens when calling textcat.predict

* more explicit value test

											
										
										
											2019-07-30 15:58:01 +03:00
+								        tensors = [doc.tensor for doc in docs]
 								        if not any(len(doc) for doc in docs):
 								            # Handle cases where there are no tokens in any docs.
 								            xp = get_array_module(tensors)
 								            scores = xp.zeros((len(docs), len(self.labels)))
 								            return scores, tensors
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								        scores = self.model(docs)
 								        scores = self.model.ops.asarray(scores)
-												tensorizer return parameter fix

											
										
										
											2017-11-05 14:25:10 +03:00
+								        return scores, tensors
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
-												Set Doc.tensor from Tagger

											
										
										
											2017-11-03 13:20:05 +03:00
+								    def set_annotations(self, docs, scores, tensors=None):
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								        for i, doc in enumerate(docs):
-												Make gold_tuples arg optional in begin_training

											
										
										
											2017-07-22 21:04:43 +03:00
+								            for j, label in enumerate(self.labels):
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								                doc.cats[label] = float(scores[i, j])
-												Don't share CNN, to reduce complexities

											
										
										
											2017-09-21 15:59:48 +03:00
+								    def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
-												more friendly textcat errors (#3946)

* more friendly textcat errors with require_model and require_labels

* update thinc version with recent bugfix

											
										
										
											2019-07-10 20:39:38 +03:00
+								        self.require_model()
-												Ensure training doesn't crash with empty batches (#4360)

* unit test for previously resolved unflatten issue

* prevent a batch of empty docs from causing problems

											
										
										
											2019-10-02 13:50:48 +03:00
+								        if not any(len(doc) for doc in docs):
 								            # Handle cases where there are no tokens in any docs.
 								            return
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								        scores, bp_scores = self.model.begin_update(docs, drop=drop)
 								        loss, d_scores = self.get_loss(docs, golds, scores)
-												Don't share CNN, to reduce complexities

											
										
										
											2017-09-21 15:59:48 +03:00
+								        bp_scores(d_scores, sgd=sgd)
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								        if losses is not None:
 								            losses.setdefault(self.name, 0.0)
 								            losses[self.name] += loss
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-text ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also give us a good place to implement the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categorizer
											
										
										
											2018-12-10 18:25:33 +03:00
+								    def rehearse(self, docs, drop=0., sgd=None, losses=None):
 								        if self._rehearsal_model is None:
 								            return
-												Ensure training doesn't crash with empty batches (#4360)

* unit test for previously resolved unflatten issue

* prevent a batch of empty docs from causing problems

											
										
										
											2019-10-02 13:50:48 +03:00
+								        if not any(len(doc) for doc in docs):
 								            # Handle cases where there are no tokens in any docs.
 								            return
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-text ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also give us a good place to implement the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categorizer
											
										
										
											2018-12-10 18:25:33 +03:00
+								        scores, bp_scores = self.model.begin_update(docs, drop=drop)
 								        target = self._rehearsal_model(docs)
 								        gradient = scores - target
 								        bp_scores(gradient, sgd=sgd)
 								        if losses is not None:
 								            losses.setdefault(self.name, 0.0)
 								            losses[self.name] += (gradient**2).sum()
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								    def get_loss(self, docs, golds, scores):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        truths = numpy.zeros((len(golds), len(self.labels)), dtype="f")
 								        not_missing = numpy.ones((len(golds), len(self.labels)), dtype="f")
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								        for i, gold in enumerate(golds):
 								            for j, label in enumerate(self.labels):
-												Fix multi-label support for text classification

The TextCategorizer class is supposed to support multi-label
text classification, and allow training data to contain missing
values.

For this to work, the gradient of the loss should be 0 when labels
are missing. Instead, there was no way to actually denote "missing"
in the GoldParse class, and so the TextCategorizer class treated
the label set within gold.cats as complete.

To fix this, we change GoldParse.cats to be a dict instead of a list.
The GoldParse.cats dict should map to floats, with 1. denoting
'present' and 0. denoting 'absent'. Gradients are zeroed for categories
absent from the gold.cats dict. A nice bonus is that you can also set
values between 0 and 1 for partial membership. You can also set numeric
values, if you're using a text classification model that uses an
appropriate loss function.

Unfortunately this is a breaking change; although the functionality
was only recently introduced and hasn't been properly documented
yet. I've updated the example script accordingly.

											
										
										
											2017-10-06 02:43:02 +03:00
+								                if label in gold.cats:
 								                    truths[i, j] = gold.cats[label]
 								                else:
 								                    not_missing[i, j] = 0.
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								        truths = self.model.ops.asarray(truths)
-												Fix multi-label support for text classification

The TextCategorizer class is supposed to support multi-label
text classification, and allow training data to contain missing
values.

For this to work, the gradient of the loss should be 0 when labels
are missing. Instead, there was no way to actually denote "missing"
in the GoldParse class, and so the TextCategorizer class treated
the label set within gold.cats as complete.

To fix this, we change GoldParse.cats to be a dict instead of a list.
The GoldParse.cats dict should map to floats, with 1. denoting
'present' and 0. denoting 'absent'. Gradients are zeroed for categories
absent from the gold.cats dict. A nice bonus is that you can also set
values between 0 and 1 for partial membership. You can also set numeric
values, if you're using a text classification model that uses an
appropriate loss function.

Unfortunately this is a breaking change; although the functionality
was only recently introduced and hasn't been properly documented
yet. I've updated the example script accordingly.

											
										
										
											2017-10-06 02:43:02 +03:00
+								        not_missing = self.model.ops.asarray(not_missing)
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
+								        d_scores = (scores-truths) / scores.shape[0]
-												Fix multi-label support for text classification

The TextCategorizer class is supposed to support multi-label
text classification, and allow training data to contain missing
values.

For this to work, the gradient of the loss should be 0 when labels
are missing. Instead, there was no way to actually denote "missing"
in the GoldParse class, and so the TextCategorizer class treated
the label set within gold.cats as complete.

To fix this, we change GoldParse.cats to be a dict instead of a list.
The GoldParse.cats dict should map to floats, with 1. denoting
'present' and 0. denoting 'absent'. Gradients are zeroed for categories
absent from the gold.cats dict. A nice bonus is that you can also set
values between 0 and 1 for partial membership. You can also set numeric
values, if you're using a text classification model that uses an
appropriate loss function.

Unfortunately this is a breaking change; although the functionality
was only recently introduced and hasn't been properly documented
yet. I've updated the example script accordingly.

											
										
										
											2017-10-06 02:43:02 +03:00
+								        d_scores *= not_missing
-												Clean up TextCategorizer slightly

											
										
										
											2019-02-23 14:28:06 +03:00
+								        mean_square_error = (d_scores**2).sum(axis=1).mean()
-												Fix dropout in tensorizer, update comment

											
										
										
											2018-11-03 15:46:58 +03:00
+								        return float(mean_square_error), d_scores
-												Add text-classification hook to pipeline

											
										
										
											2017-07-20 01:18:15 +03:00
-												Add add_label methods to Tagger and TextCategorizer

											
										
										
											2017-11-01 18:32:44 +03:00
+								    def add_label(self, label):
 								        if label in self.labels:
 								            return 0
-												Fix add_label methods

											
										
										
											2017-11-01 19:06:43 +03:00
+								        if self.model not in (None, True, False):
-												Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop"

This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing
changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df.

											
										
										
											2018-03-27 20:23:02 +03:00
+								            # This functionality was available previously, but was broken.
 								            # The problem is that we resize the last layer, but the last layer
 								            # is actually just an ensemble. We're not resizing the child layers
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            # - a huge problem.
-												💫  Make handling of [Pipe].labels consistent  (#3273)

* Make handling of [Pipe].labels consistent

* Un-xfail passing test

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: ines <ines@ines.io>

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: ines <ines@ines.io>

* Update spacy/tests/pipeline/test_pipe_methods.py

Co-Authored-By: ines <ines@ines.io>

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: ines <ines@ines.io>

* Move error message to spacy.errors

* Fix textcat labels and test

* Make EntityRuler.labels return tuple as well

											
										
										
											2019-02-14 22:03:19 +03:00
+								            raise ValueError(Errors.E116)
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            # smaller = self.model._layers[-1]
 								            # larger = Affine(len(self.labels)+1, smaller.nI)
 								            # copy_array(larger.W[:smaller.nO], smaller.W)
 								            # copy_array(larger.b[:smaller.nO], smaller.b)
 								            # self.model._layers[-1] = larger
-												💫  Make handling of [Pipe].labels consistent  (#3273)

* Make handling of [Pipe].labels consistent

* Un-xfail passing test

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: ines <ines@ines.io>

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: ines <ines@ines.io>

* Update spacy/tests/pipeline/test_pipe_methods.py

Co-Authored-By: ines <ines@ines.io>

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: ines <ines@ines.io>

* Move error message to spacy.errors

* Fix textcat labels and test

* Make EntityRuler.labels return tuple as well

											
										
										
											2019-02-14 22:03:19 +03:00
+								        self.labels = tuple(list(self.labels) + [label])
-												Add add_label methods to Tagger and TextCategorizer

											
										
										
											2017-11-01 18:32:44 +03:00
+								        return 1
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
-												Fix get labels for textcat

											
										
										
											2019-10-07 17:50:15 +03:00
+								        for raw_text, (_, (cats, _2)) in get_gold_tuples():
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model config parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add ROCAUCScore tests and improve errors/warnings

* Add tests for ROCAUCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 23:31:31 +03:00
+								            for cat in cats:
 								                self.add_label(cat)
-												Add experimental SimilarityHook component

											
										
										
											2017-06-05 16:40:03 +03:00
+								        if self.model is True:
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								            self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors")
-												more friendly textcat errors (#3946)

* more friendly textcat errors with require_model and require_labels

* update thinc version with recent bugfix

											
										
										
											2019-07-10 20:39:38 +03:00
+								            self.require_labels()
-												Fix textcat after merge

											
										
										
											2018-04-29 16:48:53 +03:00
+								            self.model = self.Model(len(self.labels), **self.cfg)
-												Add link_vectors_to_models function

											
										
										
											2017-09-22 17:38:22 +03:00
+								            link_vectors_to_models(self.vocab)
-												Return optimizer from begin_training, creating if necessary

											
										
										
											2017-11-06 16:26:26 +03:00
+								        if sgd is None:
 								            sgd = self.create_optimizer()
 								        return sgd
-												Add experimental SimilarityHook component

											
										
										
											2017-06-05 16:40:03 +03:00
-												Fix names of pipeline components

NeuralDependencyParser --> DependencyParser
NeuralEntityRecognizer --> EntityRecognizer
TokenVectorEncoder     --> Tensorizer
NeuralLabeller         --> MultitaskObjective

											
										
										
											2017-10-26 13:38:23 +03:00
+								cdef class DependencyParser(Parser):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								    """Pipeline component for dependency parsing.
 								    DOCS: https://spacy.io/api/dependencyparser
 								    """
 								    name = "parser"
-												Get data flowing through pipeline. Needs redesign

											
										
										
											2017-05-16 12:21:59 +03:00
+								    TransitionSystem = ArcEager
-												Fix number characters

											
										
										
											2019-10-24 18:34:16 +03:00
+								    nr_feature = 8
-												Get data flowing through pipeline. Needs redesign

											
										
										
											2017-05-16 12:21:59 +03:00
-												Trigger nonproj.deprojectivize as a postprocess

											
										
										
											2017-10-07 03:00:47 +03:00
+								    @property
 								    def postprocesses(self):
-												Fix #3830: 'subtok' label being added even if learn_tokens=False (#4188)

* Prevent subtok label if not learning tokens

The parser introduces the subtok label to mark tokens that should be
merged during post-processing. Previously this happened even if we did
not have the --learn-tokens flag set. This patch passes the config
through to the parser, to prevent the problem.

* Make merge_subtokens a parser post-process if learn_subtokens

* Fix train script

* Add test for 3830: subtok problem

* Fix handling of non-subtok in parser training

											
										
										
											2019-08-23 18:54:00 +03:00
+								        output = [nonproj.deprojectivize]
 								        if self.cfg.get("learn_tokens") is True:
 								            output.append(merge_subtokens)
 								        return tuple(output)
-												Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop"

This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing
changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df.

											
										
										
											2018-03-27 20:23:02 +03:00
-												Allow multitask objectives to be added to the parser and NER more easily

											
										
										
											2018-01-21 21:37:02 +03:00
+								    def add_multitask_objective(self, target):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        if target == "cloze":
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-text ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also give us a good place to implement the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categorizer
											
										
										
											2018-12-10 18:25:33 +03:00
+								            cloze = ClozeMultitask(self.vocab)
 								            self._multitasks.append(cloze)
 								        else:
 								            labeller = MultitaskObjective(self.vocab, target=target)
 								            self._multitasks.append(labeller)
-												Trigger nonproj.deprojectivize as a postprocess

											
										
										
											2017-10-07 03:00:47 +03:00
-												Pass data as a function in begin_training methods

											
										
										
											2018-03-27 12:39:59 +03:00
+								    def init_multitask_objectives(self, get_gold_tuples, pipeline, sgd=None, **cfg):
-												Allow multitask objectives to be added to the parser and NER more easily

											
										
										
											2018-01-21 21:37:02 +03:00
+								        for labeller in self._multitasks:
-												Fix multi-task objective for parser

											
										
										
											2018-09-13 15:08:55 +03:00
+								            tok2vec = self.model.tok2vec
-												Pass data as a function in begin_training methods

											
										
										
											2018-03-27 12:39:59 +03:00
+								            labeller.begin_training(get_gold_tuples, pipeline=pipeline,
-												Return optimizer from begin_training, creating if necessary

											
										
										
											2017-11-06 16:26:26 +03:00
+								                                    tok2vec=tok2vec, sgd=sgd)
-												Allow multi-task objectives during training

											
										
										
											2017-09-26 13:42:52 +03:00
-												Add __reduce__ methods on parser subclasses. Fixes pickling.

											
										
										
											2017-05-27 23:46:06 +03:00
+								    def __reduce__(self):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								        return (DependencyParser, (self.vocab, self.moves, self.model), None, None)
-												Add __reduce__ methods on parser subclasses. Fixes pickling.

											
										
										
											2017-05-27 23:46:06 +03:00
-												💫  Make handling of [Pipe].labels consistent  (#3273)

* Make handling of [Pipe].labels consistent

* Un-xfail passing test

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: ines <ines@ines.io>

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: ines <ines@ines.io>

* Update spacy/tests/pipeline/test_pipe_methods.py

Co-Authored-By: ines <ines@ines.io>

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: ines <ines@ines.io>

* Move error message to spacy.errors

* Fix textcat labels and test

* Make EntityRuler.labels return tuple as well

											
										
										
											2019-02-14 22:03:19 +03:00
+								    @property
 								    def labels(self):
-												Improve label properties on pipes

											
										
										
											2019-09-12 19:02:44 +03:00
+								        labels = set()
-												💫  Make handling of [Pipe].labels consistent  (#3273)

* Make handling of [Pipe].labels consistent

* Un-xfail passing test

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: ines <ines@ines.io>

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: ines <ines@ines.io>

* Update spacy/tests/pipeline/test_pipe_methods.py

Co-Authored-By: ines <ines@ines.io>

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: ines <ines@ines.io>

* Move error message to spacy.errors

* Fix textcat labels and test

* Make EntityRuler.labels return tuple as well

											
										
										
											2019-02-14 22:03:19 +03:00
+								        # Get the labels from the model by looking at the available moves
-												Improve label properties on pipes

											
										
										
											2019-09-12 19:02:44 +03:00
+								        for move in self.move_names:
 								            if "-" in move:
 								                label = move.split("-")[1]
 								                if "||" in label:
 								                    label = label.split("||")[1]
 								                labels.add(label)
 								        return tuple(sorted(labels))
-												💫  Make handling of [Pipe].labels consistent  (#3273)

* Make handling of [Pipe].labels consistent

* Un-xfail passing test

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: ines <ines@ines.io>

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: ines <ines@ines.io>

* Update spacy/tests/pipeline/test_pipe_methods.py

Co-Authored-By: ines <ines@ines.io>

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: ines <ines@ines.io>

* Move error message to spacy.errors

* Fix textcat labels and test

* Make EntityRuler.labels return tuple as well

											
										
										
											2019-02-14 22:03:19 +03:00
-												Get data flowing through pipeline. Needs redesign

											
										
										
											2017-05-16 12:21:59 +03:00
-												Fix names of pipeline components

NeuralDependencyParser --> DependencyParser
NeuralEntityRecognizer --> EntityRecognizer
TokenVectorEncoder     --> Tensorizer
NeuralLabeller         --> MultitaskObjective

											
										
										
											2017-10-26 13:38:23 +03:00
+								cdef class EntityRecognizer(Parser):
-												Tidy up and improve docs and docstrings (#3370)

<!--- Provide a general summary of your changes in the title. -->

## Description
* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change
enhancement, docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2019-03-08 13:42:26 +03:00
+								    """Pipeline component for named entity recognition.
 								    DOCS: https://spacy.io/api/entityrecognizer
 								    """
-												💫 Break up large pipeline.pyx (#3246)

* Break up large pipeline.pyx

* Merge some components back together

* Fix typo

											
										
										
											2019-02-10 14:14:51 +03:00
+								    name = "ner"
-												Get data flowing through pipeline. Needs redesign

											
										
										
											2017-05-16 12:21:59 +03:00
+								    TransitionSystem = BiluoPushDown
-												Try 3 NER features

											
										
										
											2019-10-07 17:51:03 +03:00
+								    nr_feature = 3
-												Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop"

This reverts commit c9ba3d3c2dc7067cf8bd55f878cec45a8c6d73d4, reversing
changes made to 92c26a35d425d4e8ca1b805ea776ea10f5ded3df.

											
										
										
											2018-03-27 20:23:02 +03:00
-												Allow multitask objectives to be added to the parser and NER more easily

											
										
										
											2018-01-21 21:37:02 +03:00
+								    def add_multitask_objective(self, target):
-												💫 Break up large pipeline.pyx (#3246)

* Break up large pipeline.pyx

* Merge some components back together

* Fix typo

											
										
										
											2019-02-10 14:14:51 +03:00
+								        if target == "cloze":
-												💫 Better support for semi-supervised learning (#3035)

The new spacy pretrain command implemented BERT/ULMFit/etc-like transfer learning, using our Language Modelling with Approximate Outputs version of BERT's cloze task. Pretraining is convenient, but in some ways it's a bit of a strange solution. All we're doing is initialising the weights. At the same time, we're putting a lot of work into our optimisation so that it's less sensitive to initial conditions, and more likely to find good optima. I discuss this a bit in the pseudo-rehearsal blog post: https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
Support semi-supervised learning in spacy train

One obvious way to improve these pretraining methods is to do multi-task learning, instead of just transfer learning. This has been shown to work very well: https://arxiv.org/pdf/1809.08370.pdf . This patch makes it easy to do this sort of thing.

    Add a new argument to spacy train, --raw-text. This takes a jsonl file with unlabelled data that can be used in arbitrary ways to do semi-supervised learning.

    Add a new method to the Language class and to pipeline components, .rehearse(). This is like .update(), but doesn't expect GoldParse objects. It takes a batch of Doc objects, and performs an update on some semi-supervised objective.

    Move the BERT-LMAO objective out from spacy/cli/pretrain.py into spacy/_ml.py, so we can create a new pipeline component, ClozeMultitask. This can be specified as a parser or NER multitask in the spacy train command. Example usage:

python -m spacy train en ./tmp ~/data/en-core-web/train/nw.json ~/data/en-core-web/dev/nw.json --pipeline parser --raw-text ~/data/unlabelled/reddit-100k.jsonl --vectors en_vectors_web_lg --parser-multitasks cloze

Implement rehearsal methods for pipeline components

The new --raw-text argument and nlp.rehearse() method also give us a good place to implement the idea in the pseudo-rehearsal blog post in the parser. This works as follows:

    Add a new nlp.resume_training() method. This allocates copies of pre-trained models in the pipeline, setting things up for the rehearsal updates. It also returns an optimizer object. This also greatly reduces confusion around the nlp.begin_training() method, which randomises the weights, making it not suitable for adding new labels or otherwise fine-tuning a pre-trained model.

    Implement rehearsal updates on the Parser class, making it available for the dependency parser and NER. During rehearsal, the initial model is used to supervise the model being trained. The current model is asked to match the predictions of the initial model on some data. This minimises catastrophic forgetting, by keeping the model's predictions close to the original. See the blog post for details.

    Implement rehearsal updates for tagger

    Implement rehearsal updates for text categorizer
											
										
										
											2018-12-10 18:25:33 +03:00
+								            cloze = ClozeMultitask(self.vocab)
 								            self._multitasks.append(cloze)
 								        else:
 								            labeller = MultitaskObjective(self.vocab, target=target)
 								            self._multitasks.append(labeller)
-												Get spaCy train command working with neural network

* Integrate models into pipeline
* Add basic serialization (maybe incorrect)
* Fix pickle on vocab

											
										
										
											2017-05-17 13:04:50 +03:00
-												Pass data as a function in begin_training methods

											
										
										
											2018-03-27 12:39:59 +03:00
+								    def init_multitask_objectives(self, get_gold_tuples, pipeline, sgd=None, **cfg):
-												Allow multitask objectives to be added to the parser and NER more easily

											
										
										
											2018-01-21 21:37:02 +03:00
+								        for labeller in self._multitasks:
-												Fix multi-task objective for parser

											
										
										
											2018-09-13 15:08:55 +03:00
+								            tok2vec = self.model.tok2vec
-												Pass data as a function in begin_training methods

											
										
										
											2018-03-27 12:39:59 +03:00
+								            labeller.begin_training(get_gold_tuples, pipeline=pipeline,
-												Tidy up pipeline

											
										
										
											2017-10-27 21:29:08 +03:00
+								                                    tok2vec=tok2vec)
-												Restore changes to pipeline.pyx from nn-beam-parser branch

											
										
										
											2017-08-18 23:02:35 +03:00
-												Add __reduce__ methods on parser subclasses. Fixes pickling.

											
										
										
											2017-05-27 23:46:06 +03:00
+								    def __reduce__(self):
-												Tidy up pipeline

											
										
										
											2017-10-27 21:29:08 +03:00
+								        return (EntityRecognizer, (self.vocab, self.moves, self.model),
 								                None, None)
-												Trigger nonproj.deprojectivize as a postprocess

											
										
										
											2017-10-07 03:00:47 +03:00
-												Add EntityRecognizer.label property

											
										
										
											2018-11-18 02:06:26 +03:00
+								    @property
 								    def labels(self):
 								        # Get the labels from the model by looking at the available moves, e.g.
 								        # B-PERSON, I-PERSON, L-PERSON, U-PERSON
-												Improve label properties on pipes

											
										
										
											2019-09-12 19:02:44 +03:00
+								        labels = set(move.split("-")[1] for move in self.move_names
 								                     if move[0] in ("B", "I", "L", "U"))
 								        return tuple(sorted(labels))
-												Add EntityRecognizer.label property

											
										
										
											2018-11-18 02:06:26 +03:00
-												Add beam-search classes

											
										
										
											2017-03-15 17:27:41 +03:00
-												adding kb_id as field to token, el as nlp pipeline component

											
										
										
											2019-03-06 21:34:18 +03:00
+								class EntityLinker(Pipe):
-												code cleanup

											
										
										
											2019-06-06 21:22:14 +03:00
+								    """Pipeline component for named entity linking.
-												CLI scripts for entity linking (wikipedia & generic) (#4091)

* document token ent_kb_id

* document span kb_id

* update pipeline documentation

* prior and context weights as bool's instead

* entitylinker api documentation

* drop for both models

* finish entitylinker documentation

* small fixes

* documentation for KB

* candidate documentation

* links to api pages in code

* small fix

* frequency examples as counts for consistency

* consistent documentation about tensors returned by predict

* add entity linking to usage 101

* add entity linking infobox and KB section to 101

* entity-linking in linguistic features

* small typo corrections

* training example and docs for entity_linker

* predefined nlp and kb

* revert back to similarity encodings for simplicity (for now)

* set prior probabilities to 0 when excluded

* code clean up

* bugfix: deleting kb ID from tokens when entities were removed

* refactor train el example to use either model or vocab

* pretrain_kb example for example kb generation

* add to training docs for KB + EL example scripts

* small fixes

* error numbering

* ensure the language of vocab and nlp stay consistent across serialization

* equality with =

* avoid conflict in errors file

* add error 151

* final adjustments to the train scripts - consistency

* update of goldparse documentation

* small corrections

* push commit

* turn kb_creator into CLI script (wip)

* proper parameters for training entity vectors

* wikidata pipeline split up into two executable scripts

* remove context_width

* move wikidata scripts in bin directory, remove old dummy script

* refine KB script with logs and preprocessing options

* small edits

* small improvements to logging of EL CLI script

											
										
										
											2019-08-13 16:38:59 +03:00
+								    DOCS: https://spacy.io/api/entitylinker
-												code cleanup

											
										
										
											2019-06-06 21:22:14 +03:00
+								    """
-												'entity_linker' instead of 'el'

											
										
										
											2019-03-22 15:55:10 +03:00
+								    name = 'entity_linker'
-												have gold.links correspond exactly to doc.ents

											
										
										
											2019-07-19 13:36:15 +03:00
+								    NIL = "NIL"  # string used to refer to a non-existing link
-												adding kb_id as field to token, el as nlp pipeline component

											
										
										
											2019-03-06 21:34:18 +03:00
 								    @classmethod
-												implementing el pipe in pipes.pyx (not tested yet)

											
										
										
											2019-06-03 22:32:54 +03:00
+								    def Model(cls, **cfg):
 								        embed_width = cfg.get("embed_width", 300)
-												clean up code, remove old code, move to bin

											
										
										
											2019-06-18 14:20:40 +03:00
+								        hidden_width = cfg.get("hidden_width", 128)
-												experiment with adding NER types to the feature vector

											
										
										
											2019-06-29 15:52:36 +03:00
+								        type_to_int = cfg.get("type_to_int", dict())
-												implementing el pipe in pipes.pyx (not tested yet)

											
										
										
											2019-06-03 22:32:54 +03:00
-												experiment with adding NER types to the feature vector

											
										
										
											2019-06-29 15:52:36 +03:00
+								        model = build_nel_encoder(embed_width=embed_width, hidden_width=hidden_width, ner_types=len(type_to_int), **cfg)
-												sentence encoder only (removing article/mention encoder)

											
										
										
											2019-06-18 01:05:47 +03:00
+								        return model
-												implementing el pipe in pipes.pyx (not tested yet)

											
										
										
											2019-06-03 22:32:54 +03:00
-												deuglify kb deserializer

											
										
										
											2019-07-03 16:00:42 +03:00
+								    def __init__(self, vocab, **cfg):
 								        self.vocab = vocab
-												sentence encoder only (removing article/mention encoder)

											
										
										
											2019-06-18 01:05:47 +03:00
+								        self.model = True
-												write entity linking pipe to file and keep vocab consistent between kb and nlp

											
										
										
											2019-06-13 17:25:39 +03:00
+								        self.kb = None
-												adding kb_id as field to token, el as nlp pipeline component

											
										
										
											2019-03-06 21:34:18 +03:00
+								        self.cfg = dict(cfg)
-												eval on dev set, varying combo's of prior and context scores

											
										
										
											2019-06-11 12:40:58 +03:00
-												write entity linking pipe to file and keep vocab consistent between kb and nlp

											
										
										
											2019-06-13 17:25:39 +03:00
+								    def set_kb(self, kb):
 								        self.kb = kb
-												implementing el pipe in pipes.pyx (not tested yet)

											
										
										
											2019-06-03 22:32:54 +03:00
 								    def require_model(self):
-												eval on dev set, varying combo's of prior and context scores

											
										
										
											2019-06-11 12:40:58 +03:00
+								        # Raise an error if the component's model is not initialized.
-												sentence encoder only (removing article/mention encoder)

											
										
										
											2019-06-18 01:05:47 +03:00
+								        if getattr(self, "model", None) in (None, True, False):
-												implementing el pipe in pipes.pyx (not tested yet)

											
										
										
											2019-06-03 22:32:54 +03:00
+								            raise ValueError(Errors.E109.format(name=self.name))
-												write entity linking pipe to file and keep vocab consistent between kb and nlp

											
										
										
											2019-06-13 17:25:39 +03:00
+								    def require_kb(self):
 								        # Raise an error if the knowledge base is not initialized.
 								        if getattr(self, "kb", None) in (None, True, False):
-												custom error and warning messages

											
										
										
											2019-06-19 13:35:26 +03:00
+								            raise ValueError(Errors.E139.format(name=self.name))
-												write entity linking pipe to file and keep vocab consistent between kb and nlp

											
										
										
											2019-06-13 17:25:39 +03:00
-												implementing el pipe in pipes.pyx (not tested yet)

											
										
										
											2019-06-03 22:32:54 +03:00
+								    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
-												write entity linking pipe to file and keep vocab consistent between kb and nlp

											
										
										
											2019-06-13 17:25:39 +03:00
+								        self.require_kb()
 								        self.cfg["entity_width"] = self.kb.entity_vector_length
-												sentence encoder only (removing article/mention encoder)

											
										
										
											2019-06-18 01:05:47 +03:00
+								        if self.model is True:
 								            self.model = self.Model(**self.cfg)
 								        if sgd is None:
 								            sgd = self.create_optimizer()
-												fix for context encoder optimizer

											
										
										
											2019-07-03 14:35:36 +03:00
-												sentence encoder only (removing article/mention encoder)

											
										
										
											2019-06-18 01:05:47 +03:00
+								        return sgd
-												implementing el pipe in pipes.pyx (not tested yet)

											
										
										
											2019-06-03 22:32:54 +03:00
+								    def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None):
 								        self.require_model()
-												write entity linking pipe to file and keep vocab consistent between kb and nlp

											
										
										
											2019-06-13 17:25:39 +03:00
+								        self.require_kb()
-												implementing el pipe in pipes.pyx (not tested yet)

											
										
										
											2019-06-03 22:32:54 +03:00
-												redo training data to be independent of KB and entity-level instead of doc-level

											
										
										
											2019-06-14 16:55:26 +03:00
+								        if losses is not None:
 								            losses.setdefault(self.name, 0.0)
 								        if not docs or not golds:
 								            return 0
-												storing NEL training data in GoldParse objects

											
										
										
											2019-06-07 13:58:42 +03:00
+								        if len(docs) != len(golds):
-												introduce goldparse.links

											
										
										
											2019-06-07 14:54:45 +03:00
+								            raise ValueError(Errors.E077.format(value="EL training", n_docs=len(docs),
-												storing NEL training data in GoldParse objects

											
										
										
											2019-06-07 13:58:42 +03:00
+								                                                n_golds=len(golds)))
-												introduce goldparse.links

											
										
										
											2019-06-07 14:54:45 +03:00
+								        if isinstance(docs, Doc):
 								            docs = [docs]
 								            golds = [golds]
-												small fixes

											
										
										
											2019-06-24 11:55:04 +03:00
+								        context_docs = []
-												fix for context encoder optimizer

											
										
										
											2019-07-03 14:35:36 +03:00
-												introduce goldparse.links

											
										
										
											2019-06-07 14:54:45 +03:00
+								        for doc, gold in zip(docs, golds):
-												experiment with adding NER types to the feature vector

											
										
										
											2019-06-29 15:52:36 +03:00
+								            ents_by_offset = dict()
 								            for ent in doc.ents:
-												format and bugfix

											
										
										
											2019-07-22 16:08:17 +03:00
+								                ents_by_offset["{}_{}".format(ent.start_char, ent.end_char)] = ent
-												have gold.links correspond exactly to doc.ents

											
										
										
											2019-07-19 13:36:15 +03:00
+								            for entity, kb_dict in gold.links.items():
 								                start, end = entity
-												redo training data to be independent of KB and entity-level instead of doc-level

											
										
										
											2019-06-14 16:55:26 +03:00
+								                mention = doc.text[start:end]
-												replace assert's with custom error messages

											
										
										
											2019-07-23 12:52:48 +03:00
-												CLI scripts for entity linking (wikipedia & generic) (#4091)

* document token ent_kb_id

* document span kb_id

* update pipeline documentation

* prior and context weights as bool's instead

* entitylinker api documentation

* drop for both models

* finish entitylinker documentation

* small fixes

* documentation for KB

* candidate documentation

* links to api pages in code

* small fix

* frequency examples as counts for consistency

* consistent documentation about tensors returned by predict

* add entity linking to usage 101

* add entity linking infobox and KB section to 101

* entity-linking in linguistic features

* small typo corrections

* training example and docs for entity_linker

* predefined nlp and kb

* revert back to similarity encodings for simplicity (for now)

* set prior probabilities to 0 when excluded

* code clean up

* bugfix: deleting kb ID from tokens when entities were removed

* refactor train el example to use either model or vocab

* pretrain_kb example for example kb generation

* add to training docs for KB + EL example scripts

* small fixes

* error numbering

* ensure the language of vocab and nlp stay consistent across serialization

* equality with =

* avoid conflict in errors file

* add error 151

* final adjustments to the train scripts - consistency

* update of goldparse documentation

* small corrections

* push commit

* turn kb_creator into CLI script (wip)

* proper parameters for training entity vectors

* wikidata pipeline split up into two executable scripts

* remove context_width

* move wikidata scripts in bin directory, remove old dummy script

* refine KB script with logs and preprocessing options

* small edits

* small improvements to logging of EL CLI script

											
										
										
											2019-08-13 16:38:59 +03:00
+								                for kb_id, value in kb_dict.items():
 								                    # Currently only training on the positive instances
 								                    if value:
 								                        context_docs.append(doc)
-												filter training data beforehand (+black formatting)

											
										
										
											2019-07-18 11:22:24 +03:00
-												CLI scripts for entity linking (wikipedia & generic) (#4091)

* document token ent_kb_id

* document span kb_id

* update pipeline documentation

* prior and context weights as bool's instead

* entitylinker api documentation

* drop for both models

* finish entitylinker documentation

* small fixes

* documentation for KB

* candidate documentation

* links to api pages in code

* small fix

* frequency examples as counts for consistency

* consistent documentation about tensors returned by predict

* add entity linking to usage 101

* add entity linking infobox and KB section to 101

* entity-linking in linguistic features

* small typo corrections

* training example and docs for entity_linker

* predefined nlp and kb

* revert back to similarity encodings for simplicity (for now)

* set prior probabilities to 0 when excluded

* code clean up

* bugfix: deleting kb ID from tokens when entities were removed

* refactor train el example to use either model or vocab

* pretrain_kb example for example kb generation

* add to training docs for KB + EL example scripts

* small fixes

* error numbering

* ensure the language of vocab and nlp stay consistent across serialization

* equality with =

* avoid conflict in errors file

* add error 151

* final adjustments to the train scripts - consistency

* update of goldparse documentation

* small corrections

* push commit

* turn kb_creator into CLI script (wip)

* proper parameters for training entity vectors

* wikidata pipeline split up into two executable scripts

* remove context_width

* move wikidata scripts in bin directory, remove old dummy script

* refine KB script with logs and preprocessing options

* small edits

* small improvements to logging of EL CLI script

											
										
										
											2019-08-13 16:38:59 +03:00
+								        context_encodings, bp_context = self.model.begin_update(context_docs, drop=drop)
 								        loss, d_scores = self.get_similarity_loss(scores=context_encodings, golds=golds, docs=None)
 								        bp_context(d_scores, sgd=sgd)
-												adding prior probability as feature in the model

											
										
										
											2019-06-28 17:22:58 +03:00
-												CLI scripts for entity linking (wikipedia & generic) (#4091)

* document token ent_kb_id

* document span kb_id

* update pipeline documentation

* prior and context weights as bool's instead

* entitylinker api documentation

* drop for both models

* finish entitylinker documentation

* small fixes

* documentation for KB

* candidate documentation

* links to api pages in code

* small fix

* frequency examples as counts for consistency

* consistent documentation about tensors returned by predict

* add entity linking to usage 101

* add entity linking infobox and KB section to 101

* entity-linking in linguistic features

* small typo corrections

* training example and docs for entity_linker

* predefined nlp and kb

* revert back to similarity encodings for simplicity (for now)

* set prior probabilities to 0 when excluded

* code clean up

* bugfix: deleting kb ID from tokens when entities were removed

* refactor train el example to use either model or vocab

* pretrain_kb example for example kb generation

* add to training docs for KB + EL example scripts

* small fixes

* error numbering

* ensure the language of vocab and nlp stay consistent across serialization

* equality with =

* avoid conflict in errors file

* add error 151

* final adjustments to the train scripts - consistency

* update of goldparse documentation

* small corrections

* push commit

* turn kb_creator into CLI script (wip)

* proper parameters for training entity vectors

* wikidata pipeline split up into two executable scripts

* remove context_width

* move wikidata scripts in bin directory, remove old dummy script

* refine KB script with logs and preprocessing options

* small edits

* small improvements to logging of EL CLI script

											
										
										
											2019-08-13 16:38:59 +03:00
+								        if losses is not None:
 								            losses[self.name] += loss
 								        return loss
-												training loop in proper pipe format

											
										
										
											2019-06-07 16:55:10 +03:00
-												CLI scripts for entity linking (wikipedia & generic) (#4091)

* document token ent_kb_id

* document span kb_id

* update pipeline documentation

* prior and context weights as bool's instead

* entitylinker api documentation

* drop for both models

* finish entitylinker documentation

* small fixes

* documentation for KB

* candidate documentation

* links to api pages in code

* small fix

* frequency examples as counts for consistency

* consistent documentation about tensors returned by predict

* add entity linking to usage 101

* add entity linking infobox and KB section to 101

* entity-linking in linguistic features

* small typo corrections

* training example and docs for entity_linker

* predefined nlp and kb

* revert back to similarity encodings for simplicity (for now)

* set prior probabilities to 0 when excluded

* code clean up

* bugfix: deleting kb ID from tokens when entities were removed

* refactor train el example to use either model or vocab

* pretrain_kb example for example kb generation

* add to training docs for KB + EL example scripts

* small fixes

* error numbering

* ensure the language of vocab and nlp stay consistent across serialization

* equality with =

* avoid conflict in errors file

* add error 151

* final adjustments to the train scripts - consistency

* update of goldparse documentation

* small corrections

* push commit

* turn kb_creator into CLI script (wip)

* proper parameters for training entity vectors

* wikidata pipeline split up into two executable scripts

* remove context_width

* move wikidata scripts in bin directory, remove old dummy script

* refine KB script with logs and preprocessing options

* small edits

* small improvements to logging of EL CLI script

											
										
										
											2019-08-13 16:38:59 +03:00
+								    def get_similarity_loss(self, docs, golds, scores):
 								        entity_encodings = []
 								        for gold in golds:
 								            for entity, kb_dict in gold.links.items():
 								                for kb_id, value in kb_dict.items():
 								                    # this loss function assumes we're only using positive examples
 								                    if value:
 								                        entity_encoding = self.kb.get_vector(kb_id)
 								                        entity_encodings.append(entity_encoding)
-												context encoder with Tok2Vec + linking model instead of cosine

											
										
										
											2019-06-28 09:29:31 +03:00
-												CLI scripts for entity linking (wikipedia & generic) (#4091)

* document token ent_kb_id

* document span kb_id

* update pipeline documentation

* prior and context weights as bool's instead

* entitylinker api documentation

* drop for both models

* finish entitylinker documentation

* small fixes

* documentation for KB

* candidate documentation

* links to api pages in code

* small fix

* frequency examples as counts for consistency

* consistent documentation about tensors returned by predict

* add entity linking to usage 101

* add entity linking infobox and KB section to 101

* entity-linking in linguistic features

* small typo corrections

* training example and docs for entity_linker

* predefined nlp and kb

* revert back to similarity encodings for simplicity (for now)

* set prior probabilities to 0 when excluded

* code clean up

* bugfix: deleting kb ID from tokens when entities were removed

* refactor train el example to use either model or vocab

* pretrain_kb example for example kb generation

* add to training docs for KB + EL example scripts

* small fixes

* error numbering

* ensure the language of vocab and nlp stay consistent across serialization

* equality with =

* avoid conflict in errors file

* add error 151

* final adjustments to the train scripts - consistency

* update of goldparse documentation

* small corrections

* push commit

* turn kb_creator into CLI script (wip)

* proper parameters for training entity vectors

* wikidata pipeline split up into two executable scripts

* remove context_width

* move wikidata scripts in bin directory, remove old dummy script

* refine KB script with logs and preprocessing options

* small edits

* small improvements to logging of EL CLI script

											
										
										
											2019-08-13 16:38:59 +03:00
+								        entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")
-												context encoder with Tok2Vec + linking model instead of cosine

											
										
										
											2019-06-28 09:29:31 +03:00
-												CLI scripts for entity linking (wikipedia & generic) (#4091)

* document token ent_kb_id

* document span kb_id

* update pipeline documentation

* prior and context weights as bool's instead

* entitylinker api documentation

* drop for both models

* finish entitylinker documentation

* small fixes

* documentation for KB

* candidate documentation

* links to api pages in code

* small fix

* frequency examples as counts for consistency

* consistent documentation about tensors returned by predict

* add entity linking to usage 101

* add entity linking infobox and KB section to 101

* entity-linking in linguistic features

* small typo corrections

* training example and docs for entity_linker

* predefined nlp and kb

* revert back to similarity encodings for simplicity (for now)

* set prior probabilities to 0 when excluded

* code clean up

* bugfix: deleting kb ID from tokens when entities were removed

* refactor train el example to use either model or vocab

* pretrain_kb example for example kb generation

* add to training docs for KB + EL example scripts

* small fixes

* error numbering

* ensure the language of vocab and nlp stay consistent across serialization

* equality with =

* avoid conflict in errors file

* add error 151

* final adjustments to the train scripts - consistency

* update of goldparse documentation

* small corrections

* push commit

* turn kb_creator into CLI script (wip)

* proper parameters for training entity vectors

* wikidata pipeline split up into two executable scripts

* remove context_width

* move wikidata scripts in bin directory, remove old dummy script

* refine KB script with logs and preprocessing options

* small edits

* small improvements to logging of EL CLI script

											
										
										
											2019-08-13 16:38:59 +03:00
+								        if scores.shape != entity_encodings.shape:
 								            raise RuntimeError(Errors.E147.format(method="get_loss", msg="gold entities do not match up"))
-												training loop in proper pipe format

											
										
										
											2019-06-07 16:55:10 +03:00
-												CLI scripts for entity linking (wikipedia & generic) (#4091)

* document token ent_kb_id

* document span kb_id

* update pipeline documentation

* prior and context weights as bool's instead

* entitylinker api documentation

* drop for both models

* finish entitylinker documentation

* small fixes

* documentation for KB

* candidate documentation

* links to api pages in code

* small fix

* frequency examples as counts for consistency

* consistent documentation about tensors returned by predict

* add entity linking to usage 101

* add entity linking infobox and KB section to 101

* entity-linking in linguistic features

* small typo corrections

* training example and docs for entity_linker

* predefined nlp and kb

* revert back to similarity encodings for simplicity (for now)

* set prior probabilities to 0 when excluded

* code clean up

* bugfix: deleting kb ID from tokens when entities were removed

* refactor train el example to use either model or vocab

* pretrain_kb example for example kb generation

* add to training docs for KB + EL example scripts

* small fixes

* error numbering

* ensure the language of vocab and nlp stay consistent across serialization

* equality with =

* avoid conflict in errors file

* add error 151

* final adjustments to the train scripts - consistency

* update of goldparse documentation

* small corrections

* push commit

* turn kb_creator into CLI script (wip)

* proper parameters for training entity vectors

* wikidata pipeline split up into two executable scripts

* remove context_width

* move wikidata scripts in bin directory, remove old dummy script

* refine KB script with logs and preprocessing options

* small edits

* small improvements to logging of EL CLI script

											
										
										
											2019-08-13 16:38:59 +03:00
+								        loss, gradients = get_cossim_loss(yh=scores, y=entity_encodings)
 								        loss = loss / len(entity_encodings)
 								        return loss, gradients
-												implementing el pipe in pipes.pyx (not tested yet)

											
										
										
											2019-06-03 22:32:54 +03:00
-												code cleanup

											
										
										
											2019-07-15 18:36:43 +03:00
+								    def get_loss(self, docs, golds, scores):
-												use original gold object in get_loss function

											
										
										
											2019-07-18 14:35:10 +03:00
+								        cats = []
 								        for gold in golds:
-												have gold.links correspond exactly to doc.ents

											
										
										
											2019-07-19 13:36:15 +03:00
+								            for entity, kb_dict in gold.links.items():
 								                for kb_id, value in kb_dict.items():
 								                    cats.append([value])
-												use original gold object in get_loss function

											
										
										
											2019-07-18 14:35:10 +03:00
 								        cats = self.model.ops.asarray(cats, dtype="float32")
-												replace assert's with custom error messages

											
										
										
											2019-07-23 12:52:48 +03:00
+								        if len(scores) != len(cats):
 								            raise RuntimeError(Errors.E147.format(method="get_loss", msg="gold entities do not match up"))
-												use original gold object in get_loss function

											
										
										
											2019-07-18 14:35:10 +03:00
 								        d_scores = (scores - cats)
-												context encoder with Tok2Vec + linking model instead of cosine

											
										
										
											2019-06-28 09:29:31 +03:00
+								        loss = (d_scores ** 2).sum()
-												use original gold object in get_loss function

											
										
										
											2019-07-18 14:35:10 +03:00
+								        loss = loss / len(cats)
-												context encoder with Tok2Vec + linking model instead of cosine

											
										
										
											2019-06-28 09:29:31 +03:00
+								        return loss, d_scores
-												adding kb_id as field to token, el as nlp pipeline component

											
										
										
											2019-03-06 21:34:18 +03:00
+								    def __call__(self, doc):
-												output tensors as part of predict

											
										
										
											2019-07-19 15:47:36 +03:00
+								        kb_ids, tensors = self.predict([doc])
 								        self.set_annotations([doc], kb_ids, tensors=tensors)
-												adding kb_id as field to token, el as nlp pipeline component

											
										
										
											2019-03-06 21:34:18 +03:00
+								        return doc
 								    def pipe(self, stream, batch_size=128, n_threads=-1):
 								        for docs in util.minibatch(stream, size=batch_size):
 								            docs = list(docs)
-												output tensors as part of predict

											
										
										
											2019-07-19 15:47:36 +03:00
+								            kb_ids, tensors = self.predict(docs)
 								            self.set_annotations(docs, kb_ids, tensors=tensors)
-												adding kb_id as field to token, el as nlp pipeline component

											
										
										
											2019-03-06 21:34:18 +03:00
+								            yield from docs
-												implementing el pipe in pipes.pyx (not tested yet)

											
										
										
											2019-06-03 22:32:54 +03:00
+								    def predict(self, docs):
-												have gold.links correspond exactly to doc.ents

											
										
										
											2019-07-19 13:36:15 +03:00
+								        """ Return the KB IDs for each entity in each doc, including NIL if there is no prediction """
-												implementing el pipe in pipes.pyx (not tested yet)

											
										
										
											2019-06-03 22:32:54 +03:00
+								        self.require_model()
-												write entity linking pipe to file and keep vocab consistent between kb and nlp

											
										
										
											2019-06-13 17:25:39 +03:00
+								        self.require_kb()
-												speeding up training

											
										
										
											2019-06-12 14:37:05 +03:00
-												have gold.links correspond exactly to doc.ents

											
										
										
											2019-07-19 13:36:15 +03:00
+								        entity_count = 0
-												small fixes

											
										
										
											2019-06-24 11:55:04 +03:00
+								        final_kb_ids = []
-												output tensors as part of predict

											
										
										
											2019-07-19 15:47:36 +03:00
+								        final_tensors = []
-												speeding up training

											
										
										
											2019-06-12 14:37:05 +03:00
-												redo training data to be independent of KB and entity-level instead of doc-level

											
										
										
											2019-06-14 16:55:26 +03:00
+								        if not docs:
-												return fix

											
										
										
											2019-07-23 15:23:58 +03:00
+								            return final_kb_ids, final_tensors
-												redo training data to be independent of KB and entity-level instead of doc-level

											
										
										
											2019-06-14 16:55:26 +03:00
 								        if isinstance(docs, Doc):
 								            docs = [docs]
-												CLI scripts for entity linking (wikipedia & generic) (#4091)

* document token ent_kb_id

* document span kb_id

* update pipeline documentation

* prior and context weights as bool's instead

* entitylinker api documentation

* drop for both models

* finish entitylinker documentation

* small fixes

* documentation for KB

* candidate documentation

* links to api pages in code

* small fix

* frequency examples as counts for consistency

* consistent documentation about tensors returned by predict

* add entity linking to usage 101

* add entity linking infobox and KB section to 101

* entity-linking in linguistic features

* small typo corrections

* training example and docs for entity_linker

* predefined nlp and kb

* revert back to similarity encodings for simplicity (for now)

* set prior probabilities to 0 when excluded

* code clean up

* bugfix: deleting kb ID from tokens when entities were removed

* refactor train el example to use either model or vocab

* pretrain_kb example for example kb generation

* add to training docs for KB + EL example scripts

* small fixes

* error numbering

* ensure the language of vocab and nlp stay consistent across serialization

* equality with =

* avoid conflict in errors file

* add error 151

* final adjustements to the train scripts - consistency

* update of goldparse documentation

* small corrections

* push commit

* turn kb_creator into CLI script (wip)

* proper parameters for training entity vectors

* wikidata pipeline split up into two executable scripts

* remove context_width

* move wikidata scripts in bin directory, remove old dummy script

* refine KB script with logs and preprocessing options

* small edits

* small improvements to logging of EL CLI script

											
										
										
											2019-08-13 16:38:59 +03:00
+								        context_encodings = self.model(docs)
-												improve speed of prediction loop

											
										
										
											2019-06-26 14:53:10 +03:00
+								        xp = get_array_module(context_encodings)
-												redo training data to be independent of KB and entity-level instead of doc-level

											
										
										
											2019-06-14 16:55:26 +03:00
+								        for i, doc in enumerate(docs):
 								            if len(doc) > 0:
-												output tensors as part of predict

											
										
										
											2019-07-19 15:47:36 +03:00
+								                # currently, the context is the same for each entity in a sentence (should be refined)
-												improve speed of prediction loop

											
										
										
											2019-06-26 14:53:10 +03:00
+								                context_encoding = context_encodings[i]
-												CLI scripts for entity linking (wikipedia & generic) (#4091)

* document token ent_kb_id

* document span kb_id

* update pipeline documentation

* prior and context weights as bool's instead

* entitylinker api documentation

* drop for both models

* finish entitylinker documentation

* small fixes

* documentation for KB

* candidate documentation

* links to api pages in code

* small fix

* frequency examples as counts for consistency

* consistent documentation about tensors returned by predict

* add entity linking to usage 101

* add entity linking infobox and KB section to 101

* entity-linking in linguistic features

* small typo corrections

* training example and docs for entity_linker

* predefined nlp and kb

* revert back to similarity encodings for simplicity (for now)

* set prior probabilities to 0 when excluded

* code clean up

* bugfix: deleting kb ID from tokens when entities were removed

* refactor train el example to use either model or vocab

* pretrain_kb example for example kb generation

* add to training docs for KB + EL example scripts

* small fixes

* error numbering

* ensure the language of vocab and nlp stay consistent across serialization

* equality with =

* avoid conflict in errors file

* add error 151

* final adjustements to the train scripts - consistency

* update of goldparse documentation

* small corrections

* push commit

* turn kb_creator into CLI script (wip)

* proper parameters for training entity vectors

* wikidata pipeline split up into two executable scripts

* remove context_width

* move wikidata scripts in bin directory, remove old dummy script

* refine KB script with logs and preprocessing options

* small edits

* small improvements to logging of EL CLI script

											
										
										
											2019-08-13 16:38:59 +03:00
+								                context_enc_t = context_encoding.T
 								                norm_1 = xp.linalg.norm(context_enc_t)
-												redo training data to be independent of KB and entity-level instead of doc-level

											
										
										
											2019-06-14 16:55:26 +03:00
+								                for ent in doc.ents:
-												have gold.links correspond exactly to doc.ents

											
										
										
											2019-07-19 13:36:15 +03:00
+								                    entity_count += 1
-												experiment with adding NER types to the feature vector

											
										
										
											2019-06-29 15:52:36 +03:00
-												small tweaks and documentation

											
										
										
											2019-06-18 19:38:09 +03:00
+								                    candidates = self.kb.get_candidates(ent.text)
-												have gold.links correspond exactly to doc.ents

											
										
										
											2019-07-19 13:36:15 +03:00
+								                    if not candidates:
 								                        final_kb_ids.append(self.NIL)  # no prediction possible for this entity
-												output tensors as part of predict

											
										
										
											2019-07-19 15:47:36 +03:00
+								                        final_tensors.append(context_encoding)
-												have gold.links correspond exactly to doc.ents

											
										
										
											2019-07-19 13:36:15 +03:00
+								                    else:
-												context encoder with Tok2Vec + linking model instead of cosine

											
										
										
											2019-06-28 09:29:31 +03:00
+								                        random.shuffle(candidates)
-												adding prior probability as feature in the model

											
										
										
											2019-06-28 17:22:58 +03:00
-												CLI scripts for entity linking (wikipedia & generic) (#4091)

* document token ent_kb_id

* document span kb_id

* update pipeline documentation

* prior and context weights as bool's instead

* entitylinker api documentation

* drop for both models

* finish entitylinker documentation

* small fixes

* documentation for KB

* candidate documentation

* links to api pages in code

* small fix

* frequency examples as counts for consistency

* consistent documentation about tensors returned by predict

* add entity linking to usage 101

* add entity linking infobox and KB section to 101

* entity-linking in linguistic features

* small typo corrections

* training example and docs for entity_linker

* predefined nlp and kb

* revert back to similarity encodings for simplicity (for now)

* set prior probabilities to 0 when excluded

* code clean up

* bugfix: deleting kb ID from tokens when entities were removed

* refactor train el example to use either model or vocab

* pretrain_kb example for example kb generation

* add to training docs for KB + EL example scripts

* small fixes

* error numbering

* ensure the language of vocab and nlp stay consistent across serialization

* equality with =

* avoid conflict in errors file

* add error 151

* final adjustements to the train scripts - consistency

* update of goldparse documentation

* small corrections

* push commit

* turn kb_creator into CLI script (wip)

* proper parameters for training entity vectors

* wikidata pipeline split up into two executable scripts

* remove context_width

* move wikidata scripts in bin directory, remove old dummy script

* refine KB script with logs and preprocessing options

* small edits

* small improvements to logging of EL CLI script

											
										
										
											2019-08-13 16:38:59 +03:00
+								                        # this will set all prior probabilities to 0 if they should be excluded from the model
 								                        prior_probs = xp.asarray([c.prior_prob for c in candidates])
 								                        if not self.cfg.get("incl_prior", True):
-												dim bugfix when incl_prior is False (#4285)


											
										
										
											2019-09-13 17:30:05 +03:00
+								                            prior_probs = xp.asarray([0.0 for c in candidates])
-												adding prior probability as feature in the model

											
										
										
											2019-06-28 17:22:58 +03:00
+								                        scores = prior_probs
-												CLI scripts for entity linking (wikipedia & generic) (#4091)

* document token ent_kb_id

* document span kb_id

* update pipeline documentation

* prior and context weights as bool's instead

* entitylinker api documentation

* drop for both models

* finish entitylinker documentation

* small fixes

* documentation for KB

* candidate documentation

* links to api pages in code

* small fix

* frequency examples as counts for consistency

* consistent documentation about tensors returned by predict

* add entity linking to usage 101

* add entity linking infobox and KB section to 101

* entity-linking in linguistic features

* small typo corrections

* training example and docs for entity_linker

* predefined nlp and kb

* revert back to similarity encodings for simplicity (for now)

* set prior probabilities to 0 when excluded

* code clean up

* bugfix: deleting kb ID from tokens when entities were removed

* refactor train el example to use either model or vocab

* pretrain_kb example for example kb generation

* add to training docs for KB + EL example scripts

* small fixes

* error numbering

* ensure the language of vocab and nlp stay consistent across serialization

* equality with =

* avoid conflict in errors file

* add error 151

* final adjustements to the train scripts - consistency

* update of goldparse documentation

* small corrections

* push commit

* turn kb_creator into CLI script (wip)

* proper parameters for training entity vectors

* wikidata pipeline split up into two executable scripts

* remove context_width

* move wikidata scripts in bin directory, remove old dummy script

* refine KB script with logs and preprocessing options

* small edits

* small improvements to logging of EL CLI script

											
										
										
											2019-08-13 16:38:59 +03:00
+								                        # add in similarity from the context
 								                        if self.cfg.get("incl_context", True):
-												adding prior probability as feature in the model

											
										
										
											2019-06-28 17:22:58 +03:00
+								                            entity_encodings = xp.asarray([c.entity_vector for c in candidates])
-												CLI scripts for entity linking (wikipedia & generic) (#4091)

* document token ent_kb_id

* document span kb_id

* update pipeline documentation

* prior and context weights as bool's instead

* entitylinker api documentation

* drop for both models

* finish entitylinker documentation

* small fixes

* documentation for KB

* candidate documentation

* links to api pages in code

* small fix

* frequency examples as counts for consistency

* consistent documentation about tensors returned by predict

* add entity linking to usage 101

* add entity linking infobox and KB section to 101

* entity-linking in linguistic features

* small typo corrections

* training example and docs for entity_linker

* predefined nlp and kb

* revert back to similarity encodings for simplicity (for now)

* set prior probabilities to 0 when excluded

* code clean up

* bugfix: deleting kb ID from tokens when entities were removed

* refactor train el example to use either model or vocab

* pretrain_kb example for example kb generation

* add to training docs for KB + EL example scripts

* small fixes

* error numbering

* ensure the language of vocab and nlp stay consistent across serialization

* equality with =

* avoid conflict in errors file

* add error 151

* final adjustements to the train scripts - consistency

* update of goldparse documentation

* small corrections

* push commit

* turn kb_creator into CLI script (wip)

* proper parameters for training entity vectors

* wikidata pipeline split up into two executable scripts

* remove context_width

* move wikidata scripts in bin directory, remove old dummy script

* refine KB script with logs and preprocessing options

* small edits

* small improvements to logging of EL CLI script

											
										
										
											2019-08-13 16:38:59 +03:00
+								                            norm_2 = xp.linalg.norm(entity_encodings, axis=1)
-												replace assert's with custom error messages

											
										
										
											2019-07-23 12:52:48 +03:00
+								                            if len(entity_encodings) != len(prior_probs):
 								                                raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length"))
-												CLI scripts for entity linking (wikipedia & generic) (#4091)

* document token ent_kb_id

* document span kb_id

* update pipeline documentation

* prior and context weights as bool's instead

* entitylinker api documentation

* drop for both models

* finish entitylinker documentation

* small fixes

* documentation for KB

* candidate documentation

* links to api pages in code

* small fix

* frequency examples as counts for consistency

* consistent documentation about tensors returned by predict

* add entity linking to usage 101

* add entity linking infobox and KB section to 101

* entity-linking in linguistic features

* small typo corrections

* training example and docs for entity_linker

* predefined nlp and kb

* revert back to similarity encodings for simplicity (for now)

* set prior probabilities to 0 when excluded

* code clean up

* bugfix: deleting kb ID from tokens when entities were removed

* refactor train el example to use either model or vocab

* pretrain_kb example for example kb generation

* add to training docs for KB + EL example scripts

* small fixes

* error numbering

* ensure the language of vocab and nlp stay consistent across serialization

* equality with =

* avoid conflict in errors file

* add error 151

* final adjustements to the train scripts - consistency

* update of goldparse documentation

* small corrections

* push commit

* turn kb_creator into CLI script (wip)

* proper parameters for training entity vectors

* wikidata pipeline split up into two executable scripts

* remove context_width

* move wikidata scripts in bin directory, remove old dummy script

* refine KB script with logs and preprocessing options

* small edits

* small improvements to logging of EL CLI script

											
										
										
											2019-08-13 16:38:59 +03:00
+								                             # cosine similarity
 								                            sims = xp.dot(entity_encodings, context_enc_t) / (norm_1 * norm_2)
-												dim bugfix when incl_prior is False (#4285)


											
										
										
											2019-09-13 17:30:05 +03:00
+								                            if sims.shape != prior_probs.shape:
 								                                raise ValueError(Errors.E161)
-												CLI scripts for entity linking (wikipedia & generic) (#4091)

* document token ent_kb_id

* document span kb_id

* update pipeline documentation

* prior and context weights as bool's instead

* entitylinker api documentation

* drop for both models

* finish entitylinker documentation

* small fixes

* documentation for KB

* candidate documentation

* links to api pages in code

* small fix

* frequency examples as counts for consistency

* consistent documentation about tensors returned by predict

* add entity linking to usage 101

* add entity linking infobox and KB section to 101

* entity-linking in linguistic features

* small typo corrections

* training example and docs for entity_linker

* predefined nlp and kb

* revert back to similarity encodings for simplicity (for now)

* set prior probabilities to 0 when excluded

* code clean up

* bugfix: deleting kb ID from tokens when entities were removed

* refactor train el example to use either model or vocab

* pretrain_kb example for example kb generation

* add to training docs for KB + EL example scripts

* small fixes

* error numbering

* ensure the language of vocab and nlp stay consistent across serialization

* equality with =

* avoid conflict in errors file

* add error 151

* final adjustements to the train scripts - consistency

* update of goldparse documentation

* small corrections

* push commit

* turn kb_creator into CLI script (wip)

* proper parameters for training entity vectors

* wikidata pipeline split up into two executable scripts

* remove context_width

* move wikidata scripts in bin directory, remove old dummy script

* refine KB script with logs and preprocessing options

* small edits

* small improvements to logging of EL CLI script

											
										
										
											2019-08-13 16:38:59 +03:00
+								                            scores = prior_probs + sims - (prior_probs*sims)
-												small tweaks and documentation

											
										
										
											2019-06-18 19:38:09 +03:00
 								                        # TODO: thresholding
-												context encoder with Tok2Vec + linking model instead of cosine

											
										
										
											2019-06-28 09:29:31 +03:00
+								                        best_index = scores.argmax()
-												small tweaks and documentation

											
										
										
											2019-06-18 19:38:09 +03:00
+								                        best_candidate = candidates[best_index]
 								                        final_kb_ids.append(best_candidate.entity_)
-												output tensors as part of predict

											
										
										
											2019-07-19 15:47:36 +03:00
+								                        final_tensors.append(context_encoding)
-												separate entity encoder to get 64D descriptions

											
										
										
											2019-06-05 01:09:46 +03:00
-												replace assert's with custom error messages

											
										
										
											2019-07-23 12:52:48 +03:00
+								        if not (len(final_tensors) == len(final_kb_ids) == entity_count):
 								            raise RuntimeError(Errors.E147.format(method="predict", msg="result variables not of equal length"))
-												implementing el pipe in pipes.pyx (not tested yet)

											
										
										
											2019-06-03 22:32:54 +03:00
-												output tensors as part of predict

											
										
										
											2019-07-19 15:47:36 +03:00
+								        return final_kb_ids, final_tensors
-												have gold.links correspond exactly to doc.ents

											
										
										
											2019-07-19 13:36:15 +03:00
 								    def set_annotations(self, docs, kb_ids, tensors=None):
-												output tensors as part of predict

											
										
										
											2019-07-19 15:47:36 +03:00
+								        count_ents = len([ent for doc in docs for ent in doc.ents])
-												replace assert's with custom error messages

											
										
										
											2019-07-23 12:52:48 +03:00
+								        if count_ents != len(kb_ids):
 								            raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
-												output tensors as part of predict

											
										
										
											2019-07-19 15:47:36 +03:00
-												have gold.links correspond exactly to doc.ents

											
										
										
											2019-07-19 13:36:15 +03:00
+								        i=0
 								        for doc in docs:
 								            for ent in doc.ents:
 								                kb_id = kb_ids[i]
 								                i += 1
 								                for token in ent:
 								                    token.ent_kb_id_ = kb_id
-												adding kb_id as field to token, el as nlp pipeline component

											
										
										
											2019-03-06 21:34:18 +03:00
-												write entity linking pipe to file and keep vocab consistent between kb and nlp

											
										
										
											2019-06-13 17:25:39 +03:00
+								    def to_disk(self, path, exclude=tuple(), **kwargs):
 								        serialize = OrderedDict()
 								        serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
-												deuglify kb deserializer

											
										
										
											2019-07-03 16:00:42 +03:00
+								        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
-												write entity linking pipe to file and keep vocab consistent between kb and nlp

											
										
										
											2019-06-13 17:25:39 +03:00
+								        serialize["kb"] = lambda p: self.kb.dump(p)
-												sentence encoder only (removing article/mention encoder)

											
										
										
											2019-06-18 01:05:47 +03:00
+								        if self.model not in (None, True, False):
 								            serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes())
-												write entity linking pipe to file and keep vocab consistent between kb and nlp

											
										
										
											2019-06-13 17:25:39 +03:00
+								        exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
 								        util.to_disk(path, serialize, exclude)
 								    def from_disk(self, path, exclude=tuple(), **kwargs):
-												sentence encoder only (removing article/mention encoder)

											
										
										
											2019-06-18 01:05:47 +03:00
+								        def load_model(p):
-												💫 Improve error message when model.from_bytes() dies (#4014)

* Improve error message when model.from_bytes() dies

When Thinc's model.from_bytes() is called with a mismatched model, often
we get a particularly ungraceful error,

e.g. "AttributeError: FunctionLayer has no attribute G"

This is because we're trying to load the parameters for something like
a LayerNorm layer, and the model architecture has some other layer there
instead. This is obviously terrible, especially since the error *type*
is wrong.

I've changed it to raise a ValueError. The error message is still
probably a bit terse, but it's hard to be sure exactly what's gone
wrong.

* Update spacy/pipeline/pipes.pyx

* Update spacy/pipeline/pipes.pyx

* Update spacy/pipeline/pipes.pyx

* Update spacy/syntax/nn_parser.pyx

* Update spacy/syntax/nn_parser.pyx

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: Matthew Honnibal <honnibal+gh@gmail.com>

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: Matthew Honnibal <honnibal+gh@gmail.com>


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-07-24 12:27:34 +03:00
+								            if self.model is True:
-												sentence encoder only (removing article/mention encoder)

											
										
										
											2019-06-18 01:05:47 +03:00
+								                self.model = self.Model(**self.cfg)
-												CLI scripts for entity linking (wikipedia & generic) (#4091)

* document token ent_kb_id

* document span kb_id

* update pipeline documentation

* prior and context weights as bool's instead

* entitylinker api documentation

* drop for both models

* finish entitylinker documentation

* small fixes

* documentation for KB

* candidate documentation

* links to api pages in code

* small fix

* frequency examples as counts for consistency

* consistent documentation about tensors returned by predict

* add entity linking to usage 101

* add entity linking infobox and KB section to 101

* entity-linking in linguistic features

* small typo corrections

* training example and docs for entity_linker

* predefined nlp and kb

* revert back to similarity encodings for simplicity (for now)

* set prior probabilities to 0 when excluded

* code clean up

* bugfix: deleting kb ID from tokens when entities were removed

* refactor train el example to use either model or vocab

* pretrain_kb example for example kb generation

* add to training docs for KB + EL example scripts

* small fixes

* error numbering

* ensure the language of vocab and nlp stay consistent across serialization

* equality with =

* avoid conflict in errors file

* add error 151

* final adjustements to the train scripts - consistency

* update of goldparse documentation

* small corrections

* push commit

* turn kb_creator into CLI script (wip)

* proper parameters for training entity vectors

* wikidata pipeline split up into two executable scripts

* remove context_width

* move wikidata scripts in bin directory, remove old dummy script

* refine KB script with logs and preprocessing options

* small edits

* small improvements to logging of EL CLI script

											
										
										
											2019-08-13 16:38:59 +03:00
+								            try:
-												💫 Improve error message when model.from_bytes() dies (#4014)

* Improve error message when model.from_bytes() dies

When Thinc's model.from_bytes() is called with a mismatched model, often
we get a particularly ungraceful error,

e.g. "AttributeError: FunctionLayer has no attribute G"

This is because we're trying to load the parameters for something like
a LayerNorm layer, and the model architecture has some other layer there
instead. This is obviously terrible, especially since the error *type*
is wrong.

I've changed it to raise a ValueError. The error message is still
probably a bit terse, but it's hard to be sure exactly what's gone
wrong.

* Update spacy/pipeline/pipes.pyx

* Update spacy/pipeline/pipes.pyx

* Update spacy/pipeline/pipes.pyx

* Update spacy/syntax/nn_parser.pyx

* Update spacy/syntax/nn_parser.pyx

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: Matthew Honnibal <honnibal+gh@gmail.com>

* Update spacy/pipeline/pipes.pyx

Co-Authored-By: Matthew Honnibal <honnibal+gh@gmail.com>


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-07-24 12:27:34 +03:00
+								                self.model.from_bytes(p.open("rb").read())
 								            except AttributeError:
 								                raise ValueError(Errors.E149)
-												write entity linking pipe to file and keep vocab consistent between kb and nlp

											
										
										
											2019-06-13 17:25:39 +03:00
-												deuglify kb deserializer

											
										
										
											2019-07-03 16:00:42 +03:00
+								        def load_kb(p):
 								            kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=self.cfg["entity_width"])
 								            kb.load_bulk(p)
 								            self.set_kb(kb)
-												write entity linking pipe to file and keep vocab consistent between kb and nlp

											
										
										
											2019-06-13 17:25:39 +03:00
+								        deserialize = OrderedDict()
 								        deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p))
-												deuglify kb deserializer

											
										
										
											2019-07-03 16:00:42 +03:00
+								        deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
 								        deserialize["kb"] = load_kb
-												sentence encoder only (removing article/mention encoder)

											
										
										
											2019-06-18 01:05:47 +03:00
+								        deserialize["model"] = load_model
-												write entity linking pipe to file and keep vocab consistent between kb and nlp

											
										
										
											2019-06-13 17:25:39 +03:00
+								        exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
 								        util.from_disk(path, deserialize, exclude)
 								        return self
 								    def rehearse(self, docs, sgd=None, losses=None, **config):
-												small fixes

											
										
										
											2019-06-24 11:55:04 +03:00
+								        raise NotImplementedError
-												write entity linking pipe to file and keep vocab consistent between kb and nlp

											
										
										
											2019-06-13 17:25:39 +03:00
 								    def add_label(self, label):
-												small fixes

											
										
										
											2019-06-24 11:55:04 +03:00
+								        raise NotImplementedError
-												write entity linking pipe to file and keep vocab consistent between kb and nlp

											
										
										
											2019-06-13 17:25:39 +03:00
-												💫 Add better and serializable sentencizer (#3471)

* Add better serializable sentencizer component

* Replace default factory

* Add tests

* Tidy up

* Pass test

* Update docs

											
										
										
											2019-03-23 17:45:02 +03:00
class Sentencizer(object):
    """Segment the Doc into sentences using a rule-based strategy.

    A new sentence starts at the first token that follows one or more
    sentence-final punctuation tokens and is not itself punctuation.

    DOCS: https://spacy.io/api/sentencizer
    """

    name = "sentencizer"

    # Default sentence-final punctuation. According to the change that
    # introduced it, the list was generated with:
    #   unichars -gas '[\p{Sentence_Break=STerm}\p{Sentence_Break=ATerm}]' '\p{Terminal_Punctuation}'
    # See: https://stackoverflow.com/a/9508766/461847
    # Most of these characters belong to writing systems that are not
    # otherwise supported; including them is not expected to cause problems.
    default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
            '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄',
            '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿',
            '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶',
            '꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒',
            '﹖', '﹗', '！', '．', '？', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀',
            '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼',
            '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐',
            '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂',
            '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈']

    def __init__(self, punct_chars=None, **kwargs):
        """Initialize the sentencizer.

        punct_chars (list): Punctuation characters to split on. Will be
            serialized with the nlp object. If None or empty, the class-level
            `default_punct_chars` are used.
        **kwargs: Accepted for call compatibility; not used here.
        RETURNS (Sentencizer): The sentencizer component.

        DOCS: https://spacy.io/api/sentencizer#init
        """
        # Kept as a set: the default list is long and membership is tested
        # once per token in __call__.
        if punct_chars:
            self.punct_chars = set(punct_chars)
        else:
            self.punct_chars = set(self.default_punct_chars)

    def __call__(self, doc):
        """Apply the sentencizer to a Doc and set Token.is_sent_start.

        doc (Doc): The document to process.
        RETURNS (Doc): The processed Doc.

        DOCS: https://spacy.io/api/sentencizer#call
        """
        start = 0
        seen_period = False
        for i, token in enumerate(doc):
            is_in_punct_chars = token.text in self.punct_chars
            # Reset every token first; real sentence starts are re-marked
            # below once the following boundary (or the end) is reached.
            token.is_sent_start = i == 0
            if seen_period and not token.is_punct and not is_in_punct_chars:
                # First non-punctuation token after sentence-final
                # punctuation: close the previous sentence and open a new one.
                doc[start].is_sent_start = True
                start = token.i
                seen_period = False
            elif is_in_punct_chars:
                seen_period = True
        # Mark the start of the last (possibly only) sentence. Skipped for an
        # empty doc, where there is no token to mark.
        if start < len(doc):
            doc[start].is_sent_start = True
        return doc

    def _serialize_cfg(self):
        """Build the settings dict shared by to_bytes and to_disk.

        RETURNS (dict): {"punct_chars": [...]}, with the characters sorted so
            that the serialized output does not depend on set iteration order.
        """
        return {"punct_chars": sorted(self.punct_chars)}

    def to_bytes(self, **kwargs):
        """Serialize the sentencizer to a bytestring.

        **kwargs: Accepted for call compatibility; not used here.
        RETURNS (bytes): The serialized object.

        DOCS: https://spacy.io/api/sentencizer#to_bytes
        """
        return srsly.msgpack_dumps(self._serialize_cfg())

    def from_bytes(self, bytes_data, **kwargs):
        """Load the sentencizer from a bytestring.

        bytes_data (bytes): The data to load.
        **kwargs: Accepted for call compatibility; not used here.
        RETURNS (Sentencizer): The loaded object.

        DOCS: https://spacy.io/api/sentencizer#from_bytes
        """
        cfg = srsly.msgpack_loads(bytes_data)
        # Fall back to the defaults when the payload has no "punct_chars" key.
        self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
        return self

    def to_disk(self, path, exclude=tuple(), **kwargs):
        """Serialize the sentencizer to disk.

        path (unicode / Path): Target location. The settings are written as
            JSON to this path with its suffix replaced by ".json".
        exclude: Accepted for call compatibility; not used here.
        **kwargs: Accepted for call compatibility; not used here.

        DOCS: https://spacy.io/api/sentencizer#to_disk
        """
        path = util.ensure_path(path)
        path = path.with_suffix(".json")
        srsly.write_json(path, self._serialize_cfg())

    def from_disk(self, path, exclude=tuple(), **kwargs):
        """Load the sentencizer from disk.

        path (unicode / Path): Source location. The settings are read as JSON
            from this path with its suffix replaced by ".json".
        exclude: Accepted for call compatibility; not used here.
        **kwargs: Accepted for call compatibility; not used here.
        RETURNS (Sentencizer): The loaded object.

        DOCS: https://spacy.io/api/sentencizer#from_disk
        """
        path = util.ensure_path(path)
        path = path.with_suffix(".json")
        cfg = srsly.read_json(path)
        # Fall back to the defaults when the file has no "punct_chars" key.
        self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars))
        return self
-												Auto-format [ci skip]

											
										
										
											2019-06-26 15:48:09 +03:00
-												Merge branch 'master' into feature/el-framework
											
										
										
											2019-03-26 13:00:02 +03:00
# Names exported by `from <module> import *`.
__all__ = [
    "Tagger",
    "DependencyParser",
    "EntityRecognizer",
    "Tensorizer",
    "TextCategorizer",
    "EntityLinker",
    "Sentencizer",
]