mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			973 lines
		
	
	
		
			35 KiB
		
	
	
	
		
			Cython
		
	
	
	
	
	
			
		
		
	
	
			973 lines
		
	
	
		
			35 KiB
		
	
	
	
		
			Cython
		
	
	
	
	
	
# cython: infer_types=True
 | 
						|
# cython: profile=True
 | 
						|
# coding: utf8
 | 
						|
from __future__ import unicode_literals
 | 
						|
 | 
						|
import numpy
 | 
						|
cimport numpy as np
 | 
						|
import cytoolz
 | 
						|
from collections import OrderedDict
 | 
						|
import ujson
 | 
						|
 | 
						|
from .util import msgpack
 | 
						|
from .util import msgpack_numpy
 | 
						|
 | 
						|
from thinc.api import chain
 | 
						|
from thinc.v2v import Affine, SELU, Softmax
 | 
						|
from thinc.t2v import Pooling, max_pool, mean_pool
 | 
						|
from thinc.neural.util import to_categorical, copy_array
 | 
						|
from thinc.neural._classes.difference import Siamese, CauchySimilarity
 | 
						|
 | 
						|
from .tokens.doc cimport Doc
 | 
						|
from .syntax.nn_parser cimport Parser
 | 
						|
from .syntax import nonproj
 | 
						|
from .syntax.ner cimport BiluoPushDown
 | 
						|
from .syntax.arc_eager cimport ArcEager
 | 
						|
from .morphology cimport Morphology
 | 
						|
from .vocab cimport Vocab
 | 
						|
from .syntax import nonproj
 | 
						|
from .compat import json_dumps
 | 
						|
 | 
						|
from .attrs import POS
 | 
						|
from .parts_of_speech import X
 | 
						|
from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
 | 
						|
from ._ml import link_vectors_to_models, zero_init, flatten
 | 
						|
from ._ml import create_default_optimizer
 | 
						|
from .errors import Errors, TempErrors
 | 
						|
from . import util
 | 
						|
 | 
						|
 | 
						|
class SentenceSegmenter(object):
    """A simple spaCy hook for custom sentence boundary detection logic that
    does not require the dependency parse. Pass a generator function
    `strategy` on initialization, or assign a new one to the `.strategy`
    attribute, to change how boundaries are found. A strategy is a generator
    that takes a `Doc` and yields one `Span` per sentence.
    """
    name = 'sbd'

    def __init__(self, vocab, strategy=None):
        self.vocab = vocab
        # Default (and the named 'on_punct' strategy) is punctuation-based.
        if strategy in (None, 'on_punct'):
            strategy = self.split_on_punct
        self.strategy = strategy

    def __call__(self, doc):
        # Install the strategy as the doc's sentence-iteration hook.
        doc.user_hooks['sents'] = self.strategy
        return doc

    @staticmethod
    def split_on_punct(doc):
        """Yield sentence spans, splitting after runs of '.', '!' or '?'."""
        sent_start = 0
        pending_break = False
        for token in doc:
            if pending_break:
                # Close the sentence at the first non-punctuation token
                # following a terminator.
                if not token.is_punct:
                    yield doc[sent_start:token.i]
                    sent_start = token.i
                    pending_break = False
            elif token.text in ('.', '!', '?'):
                pending_break = True
        if sent_start < len(doc):
            yield doc[sent_start:len(doc)]
 | 
						|
 | 
						|
 | 
						|
def merge_noun_chunks(doc):
    """Merge each noun chunk in the document into a single token.

    doc (Doc): The Doc object.
    RETURNS (Doc): The Doc object with merged noun chunks.
    """
    # Noun chunks require the dependency parse; without it, pass through.
    if doc.is_parsed:
        # Collect merge arguments first: merging in-place while iterating
        # doc.noun_chunks would invalidate the iterator.
        merges = [(chunk.start_char, chunk.end_char, chunk.root.tag,
                   chunk.root.dep) for chunk in doc.noun_chunks]
        for start_char, end_char, tag, dep in merges:
            doc.merge(start_char, end_char, tag=tag, dep=dep)
    return doc
 | 
						|
 | 
						|
 | 
						|
def merge_entities(doc):
    """Merge each named entity in the document into a single token.

    doc (Doc): The Doc object.
    RETURNS (Doc): The Doc object with merged entities.
    """
    # Snapshot the merge arguments up front: merging while iterating
    # doc.ents would invalidate the entity spans.
    merges = []
    for ent in doc.ents:
        merges.append((ent.start_char, ent.end_char, ent.root.tag,
                       ent.root.dep, ent.label))
    for start_char, end_char, tag, dep, ent_type in merges:
        doc.merge(start_char, end_char, tag=tag, dep=dep, ent_type=ent_type)
    return doc
 | 
						|
 | 
						|
 | 
						|
class Pipe(object):
    """This class is not instantiated directly. Components inherit from it, and
    it defines the interface that components should follow to function as
    components in a spaCy analysis pipeline.
    """
    name = None

    @classmethod
    def Model(cls, *shape, **kwargs):
        """Initialize a model for the pipe."""
        raise NotImplementedError

    def __init__(self, vocab, model=True, **cfg):
        """Create a new pipe instance."""
        raise NotImplementedError

    def __call__(self, doc):
        """Apply the pipe to one document. The document is
        modified in-place, and returned.

        Both __call__ and pipe should delegate to the `predict()`
        and `set_annotations()` methods.
        """
        scores, tensors = self.predict([doc])
        self.set_annotations([doc], scores, tensors=tensors)
        return doc

    def pipe(self, stream, batch_size=128, n_threads=-1):
        """Apply the pipe to a stream of documents.

        Both __call__ and pipe should delegate to the `predict()`
        and `set_annotations()` methods.
        """
        for docs in cytoolz.partition_all(batch_size, stream):
            docs = list(docs)
            scores, tensors = self.predict(docs)
            # BUG FIX: keyword was previously misspelled `tensor=`, which
            # raised a TypeError in any subclass implementing the declared
            # interface `set_annotations(docs, scores, tensors=None)`.
            self.set_annotations(docs, scores, tensors=tensors)
            yield from docs

    def predict(self, docs):
        """Apply the pipeline's model to a batch of docs, without
        modifying them.
        """
        raise NotImplementedError

    def set_annotations(self, docs, scores, tensors=None):
        """Modify a batch of documents, using pre-computed scores."""
        raise NotImplementedError

    def update(self, docs, golds, drop=0., sgd=None, losses=None):
        """Learn from a batch of documents and gold-standard information,
        updating the pipe's model.

        Delegates to predict() and get_loss().
        """
        raise NotImplementedError

    def get_loss(self, docs, golds, scores):
        """Find the loss and gradient of loss for the batch of
        documents and their predicted scores."""
        raise NotImplementedError

    def add_label(self, label):
        """Add an output label, to be predicted by the model.

        It's possible to extend pre-trained models with new labels,
        but care should be taken to avoid the "catastrophic forgetting"
        problem.
        """
        raise NotImplementedError

    def create_optimizer(self):
        """Create an optimizer, using settings from self.cfg['optimizer']."""
        return create_default_optimizer(self.model.ops,
                                        **self.cfg.get('optimizer', {}))

    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
                       **kwargs):
        """Initialize the pipe for training, using data examples if available.
        If no model has been initialized yet, the model is added."""
        if self.model is True:
            self.model = self.Model(**self.cfg)
        link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd

    def use_params(self, params):
        """Modify the pipe's model, to use the given parameter values."""
        with self.model.use_params(params):
            yield

    def to_bytes(self, **exclude):
        """Serialize the pipe to a bytestring."""
        serialize = OrderedDict()
        serialize['cfg'] = lambda: json_dumps(self.cfg)
        if self.model in (True, False, None):
            # Model not allocated yet: serialize the placeholder itself.
            serialize['model'] = lambda: self.model
        else:
            serialize['model'] = self.model.to_bytes
        serialize['vocab'] = self.vocab.to_bytes
        return util.to_bytes(serialize, exclude)

    def from_bytes(self, bytes_data, **exclude):
        """Load the pipe from a bytestring."""
        def load_model(b):
            # TODO: Remove this once we don't have to handle previous models
            if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg:
                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
            if self.model is True:
                self.model = self.Model(**self.cfg)
            self.model.from_bytes(b)

        # 'cfg' must be restored before 'model': Model(**self.cfg) reads it.
        deserialize = OrderedDict((
            ('cfg', lambda b: self.cfg.update(ujson.loads(b))),
            ('vocab', lambda b: self.vocab.from_bytes(b)),
            ('model', load_model),
        ))
        util.from_bytes(bytes_data, deserialize, exclude)
        return self

    def to_disk(self, path, **exclude):
        """Serialize the pipe to disk."""
        serialize = OrderedDict()
        serialize['cfg'] = lambda p: p.open('w').write(json_dumps(self.cfg))
        serialize['vocab'] = lambda p: self.vocab.to_disk(p)
        if self.model not in (None, True, False):
            serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes())
        util.to_disk(path, serialize, exclude)

    def from_disk(self, path, **exclude):
        """Load the pipe from disk."""
        def load_model(p):
            # TODO: Remove this once we don't have to handle previous models
            if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg:
                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
            if self.model is True:
                self.model = self.Model(**self.cfg)
            self.model.from_bytes(p.open('rb').read())

        # 'cfg' must be restored before 'model': Model(**self.cfg) reads it.
        deserialize = OrderedDict((
            ('cfg', lambda p: self.cfg.update(_load_cfg(p))),
            ('vocab', lambda p: self.vocab.from_disk(p)),
            ('model', load_model),
        ))
        util.from_disk(path, deserialize, exclude)
        return self
 | 
						|
 | 
						|
 | 
						|
def _load_cfg(path):
 | 
						|
    if path.exists():
 | 
						|
        with path.open() as file_:
 | 
						|
            return ujson.load(file_)
 | 
						|
    else:
 | 
						|
        return {}
 | 
						|
 | 
						|
 | 
						|
class Tensorizer(Pipe):
    """Assign position-sensitive vectors to tokens, using a CNN or RNN."""
    name = 'tensorizer'

    @classmethod
    def Model(cls, output_size=300, input_size=384, **cfg):
        """Create a new statistical model for the class.

        output_size (int): Output size of the model.
        input_size (int): Size of the input vectors.
        **cfg: Config parameters.
        RETURNS (Model): A `thinc.neural.Model` or similar instance.
        """
        model = chain(
                    SELU(output_size, input_size),
                    SELU(output_size, output_size),
                    zero_init(Affine(output_size, output_size)))
        return model

    def __init__(self, vocab, model=True, **cfg):
        """Construct a new statistical model. Weights are not allocated on
        initialisation.

        vocab (Vocab): A `Vocab` instance. The model must share the same
            `Vocab` instance with the `Doc` objects it will process.
        model (Model): A `Model` instance or `True` allocate one later.
        **cfg: Config parameters.

        EXAMPLE:
            >>> from spacy.pipeline import TokenVectorEncoder
            >>> tok2vec = TokenVectorEncoder(nlp.vocab)
            >>> tok2vec.model = tok2vec.Model(128, 5000)
        """
        self.vocab = vocab
        self.model = model
        # Upstream tok2vec layers, collected in begin_training().
        self.input_models = []
        self.cfg = dict(cfg)
        self.cfg.setdefault('cnn_maxout_pieces', 3)

    def __call__(self, doc):
        """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
        model. Vectors are set to the `Doc.tensor` attribute.

        doc (Doc): The document to add vectors to.
        RETURNS (Doc): The document, modified in place.
        """
        tokvecses = self.predict([doc])
        self.set_annotations([doc], tokvecses)
        return doc

    def pipe(self, stream, batch_size=128, n_threads=-1):
        """Process `Doc` objects as a stream.

        stream (iterator): A sequence of `Doc` objects to process.
        batch_size (int): Number of `Doc` objects to group.
        n_threads (int): Number of threads.
        YIELDS (iterator): A sequence of `Doc` objects, in order of input.
        """
        for docs in cytoolz.partition_all(batch_size, stream):
            docs = list(docs)
            tensors = self.predict(docs)
            self.set_annotations(docs, tensors)
            yield from docs

    def predict(self, docs):
        """Return a single tensor for a batch of documents.

        docs (iterable): A sequence of `Doc` objects.
        RETURNS (object): Vector representations for each token in the docs.
        """
        # Flatten to one (total_tokens, dim) array, run the model, then
        # split back into per-doc arrays.
        inputs = self.model.ops.flatten([doc.tensor for doc in docs])
        outputs = self.model(inputs)
        return self.model.ops.unflatten(outputs, [len(d) for d in docs])

    def set_annotations(self, docs, tensors):
        """Set the tensor attribute for a batch of documents.

        docs (iterable): A sequence of `Doc` objects.
        tensors (object): Vector representation for each token in the docs.
        RAISES (ValueError): If a tensor's row count doesn't match its doc.
        """
        for doc, tensor in zip(docs, tensors):
            if tensor.shape[0] != len(doc):
                raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc)))
            doc.tensor = tensor

    def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
        """Update the model.

        docs (iterable): A batch of `Doc` objects.
        golds (iterable): A batch of `GoldParse` objects.
        drop (float): The dropout rate.
        sgd (callable): An optimizer.
        RETURNS (dict): Results from the update.
        """
        if isinstance(docs, Doc):
            docs = [docs]
        # Forward pass through each upstream tok2vec, keeping the backprops.
        inputs = []
        bp_inputs = []
        for tok2vec in self.input_models:
            tensor, bp_tensor = tok2vec.begin_update(docs, drop=drop)
            inputs.append(tensor)
            bp_inputs.append(bp_tensor)
        # Inputs are concatenated feature-wise, so the gradient is split
        # back along axis=1 below.
        inputs = self.model.ops.xp.hstack(inputs)
        scores, bp_scores = self.model.begin_update(inputs, drop=drop)
        loss, d_scores = self.get_loss(docs, golds, scores)
        d_inputs = bp_scores(d_scores, sgd=sgd)
        d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1)
        for d_input, bp_input in zip(d_inputs, bp_inputs):
            bp_input(d_input, sgd=sgd)
        if losses is not None:
            losses.setdefault(self.name, 0.)
            losses[self.name] += loss
        return loss

    def get_loss(self, docs, golds, prediction):
        """Squared-error loss against the tokens' static vectors.

        RETURNS (tuple): (loss, gradient of loss w.r.t. the prediction).
        """
        # NOTE: an unused counter (`i = 0`) was removed here.
        target = []
        for doc in docs:
            vectors = self.model.ops.xp.vstack([w.vector for w in doc])
            target.append(vectors)
        target = self.model.ops.xp.vstack(target)
        d_scores = (prediction - target) / prediction.shape[0]
        loss = (d_scores**2).sum()
        return loss, d_scores

    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
                       **kwargs):
        """Allocate models, pre-process training data and acquire an
        optimizer.

        gold_tuples (iterable): Gold-standard training data.
        pipeline (list): The pipeline the model is part of.
        """
        # Collect the tok2vec layers of upstream components as inputs.
        for name, model in pipeline:
            if getattr(model, 'tok2vec', None):
                self.input_models.append(model.tok2vec)
        if self.model is True:
            self.cfg['input_size'] = 384
            self.cfg['output_size'] = 300
            self.model = self.Model(**self.cfg)
        link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd
 | 
						|
 | 
						|
 | 
						|
class Tagger(Pipe):
    """Pipeline component that predicts part-of-speech tags and writes them
    to `Doc` objects via the vocab's `Morphology`."""
    name = 'tagger'

    def __init__(self, vocab, model=True, **cfg):
        # vocab (Vocab): Shared vocabulary; tag names live on vocab.morphology.
        # model: A thinc model, or True to allocate one in begin_training().
        self.vocab = vocab
        self.model = model
        # Sort the config keys so serialized configs are deterministic.
        self.cfg = OrderedDict(sorted(cfg.items()))
        self.cfg.setdefault('cnn_maxout_pieces', 2)

    @property
    def labels(self):
        # Tag names are owned (and sorted) by the Morphology class.
        return self.vocab.morphology.tag_names

    @property
    def tok2vec(self):
        """The model's token-to-vector sublayer (with flattening), or None
        if no model has been allocated yet."""
        if self.model in (None, True, False):
            return None
        else:
            return chain(self.model.tok2vec, flatten)

    def __call__(self, doc):
        """Tag a single Doc in place and return it."""
        tags, tokvecs = self.predict([doc])
        self.set_annotations([doc], tags, tensors=tokvecs)
        return doc

    def pipe(self, stream, batch_size=128, n_threads=-1):
        """Tag a stream of Docs in batches, yielding them in input order."""
        for docs in cytoolz.partition_all(batch_size, stream):
            docs = list(docs)
            tag_ids, tokvecs = self.predict(docs)
            self.set_annotations(docs, tag_ids, tensors=tokvecs)
            yield from docs

    def predict(self, docs):
        """Return (guesses, tokvecs): per-doc arrays of predicted tag ids,
        plus the token vectors used to predict them. Docs are not modified."""
        tokvecs = self.model.tok2vec(docs)
        scores = self.model.softmax(tokvecs)
        guesses = []
        for doc_scores in scores:
            doc_guesses = doc_scores.argmax(axis=1)
            if not isinstance(doc_guesses, numpy.ndarray):
                # NOTE(review): presumably a GPU (cupy) array -- .get()
                # copies it to the host. Confirm against the model's ops.
                doc_guesses = doc_guesses.get()
            guesses.append(doc_guesses)
        return guesses, tokvecs

    def set_annotations(self, docs, batch_tag_ids, tensors=None):
        """Write predicted tag ids onto the docs' token structs, and extend
        doc.tensor with the given token vectors (if provided)."""
        if isinstance(docs, Doc):
            docs = [docs]
        cdef Doc doc
        cdef int idx = 0
        cdef Vocab vocab = self.vocab
        for i, doc in enumerate(docs):
            doc_tag_ids = batch_tag_ids[i]
            if hasattr(doc_tag_ids, 'get'):
                doc_tag_ids = doc_tag_ids.get()
            for j, tag_id in enumerate(doc_tag_ids):
                # Don't clobber preset POS tags
                if doc.c[j].tag == 0 and doc.c[j].pos == 0:
                    # Don't clobber preset lemmas
                    lemma = doc.c[j].lemma
                    vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
                    if lemma != 0 and lemma != doc.c[j].lex.orth:
                        doc.c[j].lemma = lemma
                # NOTE(review): idx is incremented but never read here.
                idx += 1
            if tensors is not None:
                # .get() copies device arrays to host before extending a
                # numpy doc.tensor.
                if isinstance(doc.tensor, numpy.ndarray) \
                and not isinstance(tensors[i], numpy.ndarray):
                    doc.extend_tensor(tensors[i].get())
                else:
                    doc.extend_tensor(tensors[i])
            doc.is_tagged = True

    def update(self, docs, golds, drop=0., sgd=None, losses=None):
        """Run a training update: forward pass, loss, backprop."""
        if losses is not None and self.name not in losses:
            losses[self.name] = 0.

        tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop)
        loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
        bp_tag_scores(d_tag_scores, sgd=sgd)

        if losses is not None:
            losses[self.name] += loss

    def get_loss(self, docs, golds, scores):
        """Return (loss, d_scores) for a batch, using categorical
        cross-entropy-style squared error against the gold tags."""
        scores = self.model.ops.flatten(scores)
        tag_index = {tag: i for i, tag in enumerate(self.labels)}
        cdef int idx = 0
        correct = numpy.zeros((scores.shape[0],), dtype='i')
        guesses = scores.argmax(axis=1)
        for gold in golds:
            for tag in gold.tags:
                if tag is None:
                    # Missing gold tag: treat the model's guess as correct,
                    # so no gradient flows for this token.
                    correct[idx] = guesses[idx]
                else:
                    correct[idx] = tag_index[tag]
                idx += 1
        correct = self.model.ops.xp.array(correct, dtype='i')
        d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
        d_scores /= d_scores.shape[0]
        loss = (d_scores**2).sum()
        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
        return float(loss), d_scores

    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
                       **kwargs):
        """Build the tag map from the training data, allocate the model if
        needed, and return an optimizer."""
        orig_tag_map = dict(self.vocab.morphology.tag_map)
        new_tag_map = OrderedDict()
        for raw_text, annots_brackets in gold_tuples:
            for annots, brackets in annots_brackets:
                ids, words, tags, heads, deps, ents = annots
                for tag in tags:
                    if tag in orig_tag_map:
                        new_tag_map[tag] = orig_tag_map[tag]
                    else:
                        # Unknown tag: default its coarse POS to X.
                        new_tag_map[tag] = {POS: X}
        cdef Vocab vocab = self.vocab
        if new_tag_map:
            vocab.morphology = Morphology(vocab.strings, new_tag_map,
                                          vocab.morphology.lemmatizer,
                                          exc=vocab.morphology.exc)
        self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
        if self.model is True:
            self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
        link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd

    @classmethod
    def Model(cls, n_tags, **cfg):
        """Build a tagger model with `n_tags` output classes."""
        if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'):
            raise ValueError(TempErrors.T008)
        return build_tagger_model(n_tags, **cfg)

    def add_label(self, label, values=None):
        """Register a new tag. Returns 1 if added, 0 if already present.
        Raises if the model is already allocated (resizing unsupported)."""
        if label in self.labels:
            return 0
        if self.model not in (True, False, None):
            # Here's how the model resizing will work, once the
            # neuron-to-tag mapping is no longer controlled by
            # the Morphology class, which sorts the tag names.
            # The sorting makes adding labels difficult.
            # smaller = self.model._layers[-1]
            # larger = Softmax(len(self.labels)+1, smaller.nI)
            # copy_array(larger.W[:smaller.nO], smaller.W)
            # copy_array(larger.b[:smaller.nO], smaller.b)
            # self.model._layers[-1] = larger
            raise ValueError(TempErrors.T003)
        tag_map = dict(self.vocab.morphology.tag_map)
        if values is None:
            values = {POS: "X"}
        tag_map[label] = values
        # Rebuild the Morphology so the new tag is incorporated.
        self.vocab.morphology = Morphology(
            self.vocab.strings, tag_map=tag_map,
            lemmatizer=self.vocab.morphology.lemmatizer,
            exc=self.vocab.morphology.exc)
        return 1

    def use_params(self, params):
        """Temporarily use the given parameter values in the model."""
        with self.model.use_params(params):
            yield

    def to_bytes(self, **exclude):
        """Serialize the tagger (model, vocab, cfg, tag map) to bytes."""
        serialize = OrderedDict()
        if self.model in (None, True, False):
            # Model not allocated yet: serialize the placeholder itself.
            serialize['model'] = lambda: self.model
        else:
            serialize['model'] = self.model.to_bytes
        serialize['vocab'] = self.vocab.to_bytes
        serialize['cfg'] = lambda: ujson.dumps(self.cfg)
        tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
        serialize['tag_map'] = lambda: msgpack.dumps(
            tag_map, use_bin_type=True, encoding='utf8')
        return util.to_bytes(serialize, exclude)

    def from_bytes(self, bytes_data, **exclude):
        """Load the tagger from a bytestring; returns self."""
        def load_model(b):
            # TODO: Remove this once we don't have to handle previous models
            if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg:
                self.cfg['pretrained_vectors'] = self.vocab.vectors.name

            if self.model is True:
                # NOTE(review): token_vector_width is computed but never
                # passed to Model -- it relies on self.cfg instead.
                token_vector_width = util.env_opt(
                    'token_vector_width',
                    self.cfg.get('token_vector_width', 128))
                self.model = self.Model(self.vocab.morphology.n_tags,
                                        **self.cfg)
            self.model.from_bytes(b)

        def load_tag_map(b):
            tag_map = msgpack.loads(b, encoding='utf8')
            self.vocab.morphology = Morphology(
                self.vocab.strings, tag_map=tag_map,
                lemmatizer=self.vocab.morphology.lemmatizer,
                exc=self.vocab.morphology.exc)

        # Order matters: tag_map and cfg must be restored before the model.
        deserialize = OrderedDict((
            ('vocab', lambda b: self.vocab.from_bytes(b)),
            ('tag_map', load_tag_map),
            ('cfg', lambda b: self.cfg.update(ujson.loads(b))),
            ('model', lambda b: load_model(b)),
        ))
        util.from_bytes(bytes_data, deserialize, exclude)
        return self

    def to_disk(self, path, **exclude):
        """Serialize the tagger to a directory on disk."""
        tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
        serialize = OrderedDict((
            ('vocab', lambda p: self.vocab.to_disk(p)),
            ('tag_map', lambda p: p.open('wb').write(msgpack.dumps(
                tag_map, use_bin_type=True, encoding='utf8'))),
            ('model', lambda p: p.open('wb').write(self.model.to_bytes())),
            ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg)))
        ))
        util.to_disk(path, serialize, exclude)

    def from_disk(self, path, **exclude):
        """Load the tagger from a directory on disk; returns self."""
        def load_model(p):
            # TODO: Remove this once we don't have to handle previous models
            if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg:
                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
            if self.model is True:
                self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
            with p.open('rb') as file_:
                self.model.from_bytes(file_.read())

        def load_tag_map(p):
            with p.open('rb') as file_:
                tag_map = msgpack.loads(file_.read(), encoding='utf8')
            self.vocab.morphology = Morphology(
                self.vocab.strings, tag_map=tag_map,
                lemmatizer=self.vocab.morphology.lemmatizer,
                exc=self.vocab.morphology.exc)

        # Order matters: cfg and tag_map must be restored before the model.
        deserialize = OrderedDict((
            ('cfg', lambda p: self.cfg.update(_load_cfg(p))),
            ('vocab', lambda p: self.vocab.from_disk(p)),
            ('tag_map', load_tag_map),
            ('model', load_model),
        ))
        util.from_disk(path, deserialize, exclude)
        return self
 | 
						|
 | 
						|
 | 
						|
class MultitaskObjective(Tagger):
    """Experimental: Assist training of a parser or tagger, by training a
    side-objective.
    """
    name = 'nn_labeller'

    def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
        """Create the side objective.

        vocab (Vocab): The shared vocabulary.
        model: The Thinc model, or True to allocate it in begin_training.
        target (unicode or callable): What to predict per token. One of
            'dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag', or a callable
            with the signature (i, words, tags, heads, deps, ents).

        RAISES (ValueError): If `target` is not one of the supported values.
        """
        self.vocab = vocab
        self.model = model
        if target == 'dep':
            self.make_label = self.make_dep
        elif target == 'tag':
            self.make_label = self.make_tag
        elif target == 'ent':
            self.make_label = self.make_ent
        elif target == 'dep_tag_offset':
            self.make_label = self.make_dep_tag_offset
        elif target == 'ent_tag':
            self.make_label = self.make_ent_tag
        elif hasattr(target, '__call__'):
            self.make_label = target
        else:
            raise ValueError(Errors.E016)
        self.cfg = dict(cfg)
        self.cfg.setdefault('cnn_maxout_pieces', 2)

    @property
    def labels(self):
        # Label-to-index mapping; kept on cfg so it round-trips through
        # serialization with the rest of the config.
        return self.cfg.setdefault('labels', {})

    @labels.setter
    def labels(self, value):
        self.cfg['labels'] = value

    def set_annotations(self, docs, dep_ids, tensors=None):
        # A side objective never writes annotations back onto the docs.
        pass

    def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None,
                       sgd=None, **kwargs):
        """Collect the label inventory from the gold data and allocate the
        model. The tok2vec layer is shared with the main component, so only
        the output softmax is new.

        RETURNS: The optimizer (`sgd` if given, else a fresh one).
        """
        gold_tuples = nonproj.preprocess_training_data(gold_tuples)
        for raw_text, annots_brackets in gold_tuples:
            for annots, brackets in annots_brackets:
                ids, words, tags, heads, deps, ents = annots
                for i in range(len(ids)):
                    label = self.make_label(i, words, tags, heads, deps, ents)
                    if label is not None and label not in self.labels:
                        self.labels[label] = len(self.labels)
        if self.model is True:
            # (Removed an unused `token_vector_width = util.env_opt(...)`
            # lookup here — Model() reads the option itself.)
            self.model = self.Model(len(self.labels), tok2vec=tok2vec)
        link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd

    @classmethod
    def Model(cls, n_tags, tok2vec=None, **cfg):
        """Chain the shared tok2vec layer with a fresh softmax output."""
        token_vector_width = util.env_opt('token_vector_width', 128)
        softmax = Softmax(n_tags, token_vector_width)
        model = chain(
            tok2vec,
            softmax
        )
        # Expose the sub-layers for predict() and weight sharing.
        model.tok2vec = tok2vec
        model.softmax = softmax
        return model

    def predict(self, docs):
        """Run the model and return (token vectors, label scores)."""
        tokvecs = self.model.tok2vec(docs)
        scores = self.model.softmax(tokvecs)
        return tokvecs, scores

    def get_loss(self, docs, golds, scores):
        """Compute squared-error loss and gradient against the gold labels.

        RAISES (ValueError): If docs and golds have different lengths.
        RETURNS (tuple): (loss, d_scores), where d_scores matches the shape
            of `scores`.
        """
        if len(docs) != len(golds):
            raise ValueError(Errors.E077.format(value='loss', n_docs=len(docs),
                                                n_golds=len(golds)))
        cdef int idx = 0
        correct = numpy.zeros((scores.shape[0],), dtype='i')
        guesses = scores.argmax(axis=1)
        for i, gold in enumerate(golds):
            for j in range(len(docs[i])):
                # Handles alignment for tokenization differences
                gold_idx = gold.cand_to_gold[j]
                if gold_idx is None:
                    idx += 1
                    continue
                label = self.make_label(gold_idx, gold.words, gold.tags,
                                        gold.heads, gold.labels, gold.ents)
                if label is None or label not in self.labels:
                    # Unknown/missing label: treat the guess as correct so the
                    # token contributes zero gradient.
                    correct[idx] = guesses[idx]
                else:
                    correct[idx] = self.labels[label]
                idx += 1
        correct = self.model.ops.xp.array(correct, dtype='i')
        d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
        d_scores /= d_scores.shape[0]
        loss = (d_scores**2).sum()
        return float(loss), d_scores

    @staticmethod
    def make_dep(i, words, tags, heads, deps, ents):
        """Label: the dependency relation, or None if unannotated."""
        if deps[i] is None or heads[i] is None:
            return None
        return deps[i]

    @staticmethod
    def make_tag(i, words, tags, heads, deps, ents):
        """Label: the part-of-speech tag."""
        return tags[i]

    @staticmethod
    def make_ent(i, words, tags, heads, deps, ents):
        """Label: the BILUO entity tag, or None if entities are absent."""
        if ents is None:
            return None
        return ents[i]

    @staticmethod
    def make_dep_tag_offset(i, words, tags, heads, deps, ents):
        """Label: 'dep-tag:offset', with the head offset clipped to [-2, 2]
        to keep the label inventory small."""
        if deps[i] is None or heads[i] is None:
            return None
        offset = heads[i] - i
        offset = min(offset, 2)
        offset = max(offset, -2)
        return '%s-%s:%d' % (deps[i], tags[i], offset)

    @staticmethod
    def make_ent_tag(i, words, tags, heads, deps, ents):
        """Label: 'tag-ent', or None when the entity annotation is missing."""
        if ents is None or ents[i] is None:
            return None
        else:
            return '%s-%s' % (tags[i], ents[i])
 | 
						|
 | 
						|
 | 
						|
class SimilarityHook(Pipe):
    """
    Experimental: A pipeline component to install a hook for supervised
    similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
    documents. The similarity model can be any object obeying the Thinc `Model`
    interface. By default, the model concatenates the elementwise mean and
    elementwise max of the two tensors, and compares them using the
    Cauchy-like similarity function from Chen (2013):

        >>> similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())

    Where W is a vector of dimension weights, initialized to 1.
    """
    name = 'similarity'

    def __init__(self, vocab, model=True, **cfg):
        """Create the hook component. `model` may be True to defer
        allocation to begin_training."""
        self.vocab = vocab
        self.model = model
        self.cfg = dict(cfg)

    @classmethod
    def Model(cls, length):
        """Build the default network: mean+max pooling fed into a
        Cauchy-style Siamese comparison of the given width."""
        pooling = Pooling(max_pool, mean_pool)
        return Siamese(pooling, CauchySimilarity(length))

    def __call__(self, doc):
        """Install similarity hook"""
        doc.user_hooks['similarity'] = self.predict
        return doc

    def pipe(self, docs, **kwargs):
        """Install the hook on each document in the stream."""
        for item in docs:
            yield self(item)

    def predict(self, doc1, doc2):
        """Score a single document pair with the similarity model."""
        return self.model.predict([(doc1, doc2)])

    def update(self, doc1_doc2, golds, sgd=None, drop=0.):
        # NOTE(review): only the forward pass runs here; the backprop
        # callback is discarded, so no weight update happens. Preserved
        # as-is from the original implementation.
        similarity_scores, finish_update = self.model.begin_update(
            doc1_doc2, drop=drop)

    def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
        """Allocate model, using width from tensorizer in pipeline.

        gold_tuples (iterable): Gold-standard training data.
        pipeline (list): The pipeline the model is part of.
        """
        if self.model is True:
            width = pipeline[0].model.nO
            self.model = self.Model(width)
            link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd
 | 
						|
 | 
						|
 | 
						|
class TextCategorizer(Pipe):
    """Pipeline component for document-level text classification. Scores are
    written to `doc.cats` as label -> float.
    """
    name = 'textcat'

    @classmethod
    def Model(cls, nr_class=1, width=64, **cfg):
        # Build the default CNN text classifier.
        return build_text_classifier(nr_class, width, **cfg)

    def __init__(self, vocab, model=True, **cfg):
        """Create the component. `model` may be True to defer allocation
        to begin_training."""
        self.vocab = vocab
        self.model = model
        self.cfg = dict(cfg)

    @property
    def labels(self):
        # Label list lives on cfg so it serializes with the config.
        return self.cfg.setdefault('labels', [])

    @labels.setter
    def labels(self, value):
        self.cfg['labels'] = value

    def __call__(self, doc):
        """Score a single document and write the results to doc.cats."""
        scores, tensors = self.predict([doc])
        self.set_annotations([doc], scores, tensors=tensors)
        return doc

    def pipe(self, stream, batch_size=128, n_threads=-1):
        """Process a stream of documents in batches.

        n_threads is accepted for API compatibility; it is not used here.
        """
        for docs in cytoolz.partition_all(batch_size, stream):
            docs = list(docs)
            scores, tensors = self.predict(docs)
            self.set_annotations(docs, scores, tensors=tensors)
            yield from docs

    def predict(self, docs):
        """Run the model on a batch of docs.

        RETURNS (tuple): (scores, tensors) — per-doc class scores and the
            docs' existing tensors.
        """
        scores = self.model(docs)
        scores = self.model.ops.asarray(scores)
        tensors = [doc.tensor for doc in docs]
        return scores, tensors

    def set_annotations(self, docs, scores, tensors=None):
        # Write one float per known label into each doc's cats dict.
        for i, doc in enumerate(docs):
            for j, label in enumerate(self.labels):
                doc.cats[label] = float(scores[i, j])

    def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
        """Run a training step on a batch, backpropagating the loss and
        (optionally) accumulating it into `losses` under this component's
        name."""
        scores, bp_scores = self.model.begin_update(docs, drop=drop)
        loss, d_scores = self.get_loss(docs, golds, scores)
        bp_scores(d_scores, sgd=sgd)
        if losses is not None:
            losses.setdefault(self.name, 0.0)
            losses[self.name] += loss

    def get_loss(self, docs, golds, scores):
        """Compute mean-squared-error loss and gradient.

        Labels missing from a gold's cats dict are masked out of the
        gradient (treated as "missing", not as negative examples).
        RETURNS (tuple): (mean_square_error, d_scores).
        """
        truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
        not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f')
        for i, gold in enumerate(golds):
            for j, label in enumerate(self.labels):
                if label in gold.cats:
                    truths[i, j] = gold.cats[label]
                else:
                    not_missing[i, j] = 0.
        truths = self.model.ops.asarray(truths)
        not_missing = self.model.ops.asarray(not_missing)
        # Mean gradient over the batch, then zero out missing labels.
        d_scores = (scores-truths) / scores.shape[0]
        d_scores *= not_missing
        mean_square_error = ((scores-truths)**2).sum(axis=1).mean()
        return mean_square_error, d_scores

    def add_label(self, label):
        """Add a new label, resizing the model's output layer if it has
        already been allocated.

        RETURNS (int): 1 if the label was added, 0 if already present.
        """
        if label in self.labels:
            return 0
        if self.model not in (None, True, False):
            # Grow the final Affine layer by one row, copying the existing
            # weights so previously-learned labels are preserved.
            smaller = self.model._layers[-1]
            larger = Affine(len(self.labels)+1, smaller.nI)
            copy_array(larger.W[:smaller.nO], smaller.W)
            copy_array(larger.b[:smaller.nO], smaller.b)
            self.model._layers[-1] = larger
        self.labels.append(label)
        return 1

    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
                       **kwargs):
        """Allocate the model (if deferred) and return an optimizer.

        The token vector width is taken from a preceding tensorizer in the
        pipeline when available, otherwise defaults to 64.
        """
        if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer':
            token_vector_width = pipeline[0].model.nO
        else:
            token_vector_width = 64

        if self.model is True:
            self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
            self.model = self.Model(len(self.labels), token_vector_width,
                                    **self.cfg)
            link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd
 | 
						|
 | 
						|
 | 
						|
cdef class DependencyParser(Parser):
    """Transition-based dependency parser, using the arc-eager
    transition system."""
    name = 'parser'
    TransitionSystem = ArcEager

    @property
    def postprocesses(self):
        # Restore non-projective arcs after pseudo-projective decoding.
        return [nonproj.deprojectivize]

    def add_multitask_objective(self, target):
        # Register a side objective to be trained alongside the parser.
        objective = MultitaskObjective(self.vocab, target=target)
        self._multitasks.append(objective)

    def init_multitask_objectives(self, gold_tuples, pipeline, sgd=None, **cfg):
        # Initialize each side objective, sharing this parser's tok2vec
        # layer and optimizer.
        for objective in self._multitasks:
            shared_tok2vec = self.model[0]
            objective.begin_training(gold_tuples, pipeline=pipeline,
                                     tok2vec=shared_tok2vec, sgd=sgd)

    def __reduce__(self):
        # Support pickling of the Cython extension type.
        return (DependencyParser, (self.vocab, self.moves, self.model),
                None, None)
 | 
						|
 | 
						|
 | 
						|
cdef class EntityRecognizer(Parser):
    """Transition-based named entity recognizer, using the BILUO
    push-down transition system."""
    name = 'ner'
    TransitionSystem = BiluoPushDown

    nr_feature = 6

    def add_multitask_objective(self, target):
        # Register a side objective to be trained alongside the recognizer.
        labeller = MultitaskObjective(self.vocab, target=target)
        self._multitasks.append(labeller)

    def init_multitask_objectives(self, gold_tuples, pipeline, sgd=None, **cfg):
        # Initialize each side objective, sharing this component's tok2vec
        # layer. Forward `sgd` so the objectives train with the caller's
        # optimizer — previously it was dropped here, so each objective
        # silently created its own (unlike DependencyParser, which forwards
        # it).
        for labeller in self._multitasks:
            tok2vec = self.model[0]
            labeller.begin_training(gold_tuples, pipeline=pipeline,
                                    tok2vec=tok2vec, sgd=sgd)

    def __reduce__(self):
        # Support pickling of the Cython extension type.
        return (EntityRecognizer, (self.vocab, self.moves, self.model),
                None, None)
 | 
						|
 | 
						|
 | 
						|
__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer']
 |