# cython: infer_types=True
# cython: profile=True
# coding: utf8
from __future__ import unicode_literals

import numpy
cimport numpy as np
import cytoolz
from collections import OrderedDict
import ujson

from .util import msgpack
from .util import msgpack_numpy

from thinc.api import chain
from thinc.v2v import Affine, SELU, Softmax
from thinc.t2v import Pooling, max_pool, mean_pool
from thinc.neural.util import to_categorical, copy_array
from thinc.neural._classes.difference import Siamese, CauchySimilarity

from .tokens.doc cimport Doc
from .syntax.nn_parser cimport Parser
from .syntax import nonproj
from .syntax.ner cimport BiluoPushDown
from .syntax.arc_eager cimport ArcEager
from .morphology cimport Morphology
from .vocab cimport Vocab
from .syntax import nonproj
from .compat import json_dumps

from .attrs import POS
from .parts_of_speech import X
from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
from ._ml import link_vectors_to_models, zero_init, flatten
from ._ml import create_default_optimizer
from . import util


class SentenceSegmenter(object):
    """A simple spaCy hook, to allow custom sentence boundary detection logic
    (that doesn't require the dependency parse). To change the sentence
    boundary detection strategy, pass a generator function `strategy` on
    initialization, or assign a new strategy to the .strategy attribute.
    Sentence detection strategies should be generators that take `Doc` objects
    and yield `Span` objects for each sentence.
    """
    name = 'sbd'

    def __init__(self, vocab, strategy=None):
        """Create the segmenter.

        vocab (Vocab): The shared vocabulary; stored on the instance.
        strategy (callable or unicode): A generator function taking a `Doc`,
            or None / 'on_punct' to use `split_on_punct`.
        """
        self.vocab = vocab
        if strategy is None or strategy == 'on_punct':
            strategy = self.split_on_punct
        self.strategy = strategy

    def __call__(self, doc):
        """Install the strategy as the doc's 'sents' user hook.

        doc (Doc): The document to modify.
        RETURNS (Doc): The same document.
        """
        doc.user_hooks['sents'] = self.strategy
        return doc

    @staticmethod
    def split_on_punct(doc):
        """Split after runs of '.', '!' or '?' (plus trailing punctuation).

        doc (Doc): The document to segment.
        YIELDS (Span): One slice of the doc per sentence.
        """
        start = 0
        seen_period = False
        for word in doc:
            # A sentence ends at the first non-punctuation token that follows
            # a sentence-final mark, so closing quotes/brackets stay attached.
            if seen_period and not word.is_punct:
                yield doc[start:word.i]
                start = word.i
                seen_period = False
            elif word.text in ['.', '!', '?']:
                seen_period = True
        if start < len(doc):
            yield doc[start:len(doc)]


def merge_noun_chunks(doc):
    """Merge noun chunks into a single token.

    doc (Doc): The Doc object.
    RETURNS (Doc): The Doc object with merged noun chunks. If the doc has not
        been parsed it is returned unchanged.
    """
    if not doc.is_parsed:
        # Always hand the doc back so the next pipeline component gets a Doc
        # rather than None.
        return doc
    # Collect the character offsets first: merging while iterating over
    # doc.noun_chunks would change the doc underneath the iterator.
    # NOTE: the loop variable must not be called `np`, which is the cimported
    # numpy module alias in this file.
    spans = [(chunk.start_char, chunk.end_char, chunk.root.tag, chunk.root.dep)
             for chunk in doc.noun_chunks]
    for start, end, tag, dep in spans:
        doc.merge(start, end, tag=tag, dep=dep)
    return doc


def merge_entities(doc):
    """Merge entities into a single token.

    doc (Doc): The Doc object.
    RETURNS (Doc): The Doc object with merged noun entities.
    """
    # Collect the character offsets first, then merge, so that doc.ents is
    # not iterated while the doc is being modified.
    spans = [(e.start_char, e.end_char, e.root.tag, e.root.dep, e.label)
             for e in doc.ents]
    for start, end, tag, dep, ent_type in spans:
        doc.merge(start, end, tag=tag, dep=dep, ent_type=ent_type)
    return doc


class Pipe(object):
    """This class is not instantiated directly. Components inherit from it, and
    it defines the interface that components should follow to function as
    components in a spaCy analysis pipeline.
    """
    name = None

    @classmethod
    def Model(cls, *shape, **kwargs):
        """Initialize a model for the pipe."""
        raise NotImplementedError

    def __init__(self, vocab, model=True, **cfg):
        """Create a new pipe instance."""
        raise NotImplementedError

    def __call__(self, doc):
        """Apply the pipe to one document. The document is
        modified in-place, and returned.

        Both __call__ and pipe should delegate to the `predict()`
        and `set_annotations()` methods.
        """
        scores, tensors = self.predict([doc])
        self.set_annotations([doc], scores, tensors=tensors)
        return doc

    def pipe(self, stream, batch_size=128, n_threads=-1):
        """Apply the pipe to a stream of documents.

        Both __call__ and pipe should delegate to the `predict()`
        and `set_annotations()` methods.
        """
        for docs in cytoolz.partition_all(batch_size, stream):
            docs = list(docs)
            scores, tensors = self.predict(docs)
            # The keyword is `tensors`, matching set_annotations() and
            # __call__ above.
            self.set_annotations(docs, scores, tensors=tensors)
            yield from docs

    def predict(self, docs):
        """Apply the pipeline's model to a batch of docs, without
        modifying them.
        """
        raise NotImplementedError

    def set_annotations(self, docs, scores, tensors=None):
        """Modify a batch of documents, using pre-computed scores."""
        raise NotImplementedError

    def update(self, docs, golds, drop=0., sgd=None, losses=None):
        """Learn from a batch of documents and gold-standard information,
        updating the pipe's model.

        Delegates to predict() and get_loss().
        """
        raise NotImplementedError

    def get_loss(self, docs, golds, scores):
        """Find the loss and gradient of loss for the batch of
        documents and their predicted scores."""
        raise NotImplementedError

    def add_label(self, label):
        """Add an output label, to be predicted by the model.

        It's possible to extend pre-trained models with new labels,
        but care should be taken to avoid the "catastrophic forgetting"
        problem.
        """
        raise NotImplementedError

    def create_optimizer(self):
        """Create the default optimizer for this pipe's model, configured
        from the 'optimizer' entry of `self.cfg` if there is one."""
        return create_default_optimizer(self.model.ops,
                                        **self.cfg.get('optimizer', {}))

    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
                       **kwargs):
        """Initialize the pipe for training, using data examples if available.
        If no model has been initialized yet, the model is added.

        RETURNS (callable): The optimizer passed in, or a newly created one.
        """
        if self.model is True:
            self.model = self.Model(**self.cfg)
        link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd

    def use_params(self, params):
        """Modify the pipe's model, to use the given parameter values.

        This is a generator: the parameters are in effect while it is
        suspended at its single `yield`.
        """
        with self.model.use_params(params):
            yield

    def to_bytes(self, **exclude):
        """Serialize the pipe to a bytestring."""
        serialize = OrderedDict()
        serialize['cfg'] = lambda: json_dumps(self.cfg)
        if self.model in (True, False, None):
            # No model allocated yet: store the placeholder value itself.
            serialize['model'] = lambda: self.model
        else:
            serialize['model'] = self.model.to_bytes
        serialize['vocab'] = self.vocab.to_bytes
        return util.to_bytes(serialize, exclude)

    def from_bytes(self, bytes_data, **exclude):
        """Load the pipe from a bytestring."""
        def load_model(b):
            # TODO: Remove this once we don't have to handle previous models
            if 'pretrained_dims' in self.cfg and 'pretrained_vectors' not in self.cfg:
                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
            if self.model is True:
                self.model = self.Model(**self.cfg)
            self.model.from_bytes(b)

        # Order matters: cfg and vocab are loaded before the model, because
        # load_model() reads both.
        deserialize = OrderedDict((
            ('cfg', lambda b: self.cfg.update(ujson.loads(b))),
            ('vocab', lambda b: self.vocab.from_bytes(b)),
            ('model', load_model),
        ))
        util.from_bytes(bytes_data, deserialize, exclude)
        return self

    def to_disk(self, path, **exclude):
        """Serialize the pipe to disk."""
        serialize = OrderedDict()
        serialize['cfg'] = lambda p: p.open('w').write(json_dumps(self.cfg))
        serialize['vocab'] = lambda p: self.vocab.to_disk(p)
        if self.model not in (None, True, False):
            serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes())
        util.to_disk(path, serialize, exclude)

    def from_disk(self, path, **exclude):
        """Load the pipe from disk."""
        def load_model(p):
            # TODO: Remove this once we don't have to handle previous models
            if 'pretrained_dims' in self.cfg and 'pretrained_vectors' not in self.cfg:
                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
            if self.model is True:
                self.model = self.Model(**self.cfg)
            # Close the file deterministically instead of relying on
            # garbage collection.
            with p.open('rb') as file_:
                self.model.from_bytes(file_.read())

        deserialize = OrderedDict((
            ('cfg', lambda p: self.cfg.update(_load_cfg(p))),
            ('vocab', lambda p: self.vocab.from_disk(p)),
            ('model', load_model),
        ))
        util.from_disk(path, deserialize, exclude)
        return self


def _load_cfg(path):
    """Read a JSON config file.

    path (Path): Location of the config file.
    RETURNS (dict): The parsed config, or an empty dict if the file is absent.
    """
    if path.exists():
        with path.open() as file_:
            return ujson.load(file_)
    else:
        return {}


class Tensorizer(Pipe):
    """Assign position-sensitive vectors to tokens, using a CNN or RNN."""
    name = 'tensorizer'

    @classmethod
    def Model(cls, output_size=300, input_size=384, **cfg):
        """Create a new statistical model for the class.

        output_size (int): Output size of the model.
        input_size (int): Input size of the model.
        **cfg: Config parameters (not used by this model).
        RETURNS (Model): A `thinc.neural.Model` or similar instance.
        """
        model = chain(
                    SELU(output_size, input_size),
                    SELU(output_size, output_size),
                    zero_init(Affine(output_size, output_size)))
        return model

    def __init__(self, vocab, model=True, **cfg):
        """Construct a new statistical model. Weights are not allocated on
        initialisation.

        vocab (Vocab): A `Vocab` instance. The model must share the same
            `Vocab` instance with the `Doc` objects it will process.
        model (Model): A `Model` instance or `True` allocate one later.
        **cfg: Config parameters.

        EXAMPLE:
            >>> from spacy.pipeline import Tensorizer
            >>> tensorizer = Tensorizer(nlp.vocab)
            >>> tensorizer.model = tensorizer.Model(300, 384)
        """
        self.vocab = vocab
        self.model = model
        self.input_models = []
        self.cfg = dict(cfg)
        self.cfg.setdefault('cnn_maxout_pieces', 3)

    def __call__(self, doc):
        """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
        model. Vectors are set to the `Doc.tensor` attribute.

        doc (Doc): The document to add vectors to.
        RETURNS (Doc): The same document, modified in place.
        """
        tokvecses = self.predict([doc])
        self.set_annotations([doc], tokvecses)
        return doc

    def pipe(self, stream, batch_size=128, n_threads=-1):
        """Process `Doc` objects as a stream.

        stream (iterator): A sequence of `Doc` objects to process.
        batch_size (int): Number of `Doc` objects to group.
        n_threads (int): Number of threads.
        YIELDS (iterator): A sequence of `Doc` objects, in order of input.
        """
        for docs in cytoolz.partition_all(batch_size, stream):
            docs = list(docs)
            tensors = self.predict(docs)
            self.set_annotations(docs, tensors)
            yield from docs

    def predict(self, docs):
        """Return a single tensor for a batch of documents.

        docs (iterable): A sequence of `Doc` objects.
        RETURNS (object): Vector representations for each token in the docs.
        """
        inputs = self.model.ops.flatten([doc.tensor for doc in docs])
        outputs = self.model(inputs)
        return self.model.ops.unflatten(outputs, [len(d) for d in docs])

    def set_annotations(self, docs, tensors):
        """Set the tensor attribute for a batch of documents.

        docs (iterable): A sequence of `Doc` objects.
        tensors (object): Vector representation for each token in the docs.
        """
        for doc, tensor in zip(docs, tensors):
            assert tensor.shape[0] == len(doc)
            doc.tensor = tensor

    def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
        """Update the model.

        docs (iterable): A batch of `Doc` objects.
        golds (iterable): A batch of `GoldParse` objects.
        drop (float): The dropout rate.
        sgd (callable): An optimizer.
        losses (dict): Optional record of losses, keyed by component name;
            this component's loss is added to it.
        RETURNS: The loss for the batch.
        """
        if isinstance(docs, Doc):
            docs = [docs]
        inputs = []
        bp_inputs = []
        for tok2vec in self.input_models:
            tensor, bp_tensor = tok2vec.begin_update(docs, drop=drop)
            inputs.append(tensor)
            bp_inputs.append(bp_tensor)
        # The outputs of all input models are concatenated column-wise...
        inputs = self.model.ops.xp.hstack(inputs)
        scores, bp_scores = self.model.begin_update(inputs, drop=drop)
        loss, d_scores = self.get_loss(docs, golds, scores)
        d_inputs = bp_scores(d_scores, sgd=sgd)
        # ...so the gradient is split column-wise again, one part per model.
        d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1)
        for d_input, bp_input in zip(d_inputs, bp_inputs):
            bp_input(d_input, sgd=sgd)
        if losses is not None:
            losses.setdefault(self.name, 0.)
            losses[self.name] += loss
        return loss

    def get_loss(self, docs, golds, prediction):
        """Compare the prediction against the tokens' `vector` attributes.

        docs (iterable): A batch of `Doc` objects.
        golds (iterable): Not used by this objective.
        prediction (object): The model output for the batch.
        RETURNS (tuple): The loss and the gradient of the prediction.
        """
        target = []
        for doc in docs:
            vectors = self.model.ops.xp.vstack([w.vector for w in doc])
            target.append(vectors)
        target = self.model.ops.xp.vstack(target)
        d_scores = (prediction - target) / prediction.shape[0]
        loss = (d_scores**2).sum()
        return loss, d_scores

    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
                       **kwargs):
        """Allocate models, pre-process training data and acquire an
        optimizer.

        gold_tuples (iterable): Gold-standard training data.
        pipeline (list): The pipeline the model is part of, as
            (name, component) pairs. May be None.
        RETURNS (callable): The optimizer passed in, or a newly created one.
        """
        # `pipeline` defaults to None, which is not iterable.
        for name, model in (pipeline or []):
            if getattr(model, 'tok2vec', None):
                self.input_models.append(model.tok2vec)
        if self.model is True:
            self.cfg['input_size'] = 384
            self.cfg['output_size'] = 300
            self.model = self.Model(**self.cfg)
        link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd


class Tagger(Pipe):
    """Pipeline component that assigns tags to tokens, using the tag map held
    by `vocab.morphology`."""
    name = 'tagger'

    def __init__(self, vocab, model=True, **cfg):
        """Create the tagger. Weights are not allocated on initialisation.

        vocab (Vocab): The shared vocabulary.
        model (Model): A model instance, or `True` to allocate one later.
        **cfg: Config parameters.
        """
        self.vocab = vocab
        self.model = model
        self.cfg = OrderedDict(sorted(cfg.items()))
        self.cfg.setdefault('cnn_maxout_pieces', 2)

    @property
    def labels(self):
        """The tag names known to the vocab's morphology."""
        return self.vocab.morphology.tag_names

    @property
    def tok2vec(self):
        """The model's tok2vec layer chained with `flatten`, or None if no
        model has been allocated yet."""
        if self.model in (None, True, False):
            return None
        else:
            return chain(self.model.tok2vec, flatten)

    def __call__(self, doc):
        """Tag one document in place and return it."""
        tags, tokvecs = self.predict([doc])
        self.set_annotations([doc], tags, tensors=tokvecs)
        return doc

    def pipe(self, stream, batch_size=128, n_threads=-1):
        """Tag a stream of documents in batches of `batch_size`.

        YIELDS (Doc): The documents, in order of input.
        """
        for docs in cytoolz.partition_all(batch_size, stream):
            docs = list(docs)
            tag_ids, tokvecs = self.predict(docs)
            self.set_annotations(docs, tag_ids, tensors=tokvecs)
            yield from docs

    def predict(self, docs):
        """Score a batch of documents without modifying them.

        docs (iterable): A batch of `Doc` objects.
        RETURNS (tuple): A list with one array of best-scoring tag ids per
            doc, and the tok2vec output.
        """
        tokvecs = self.model.tok2vec(docs)
        scores = self.model.softmax(tokvecs)
        guesses = []
        for doc_scores in scores:
            doc_guesses = doc_scores.argmax(axis=1)
            # Arrays that are not numpy arrays are converted with .get().
            if not isinstance(doc_guesses, numpy.ndarray):
                doc_guesses = doc_guesses.get()
            guesses.append(doc_guesses)
        return guesses, tokvecs

    def set_annotations(self, docs, batch_tag_ids, tensors=None):
        """Write predicted tags (and optionally tensors) onto the documents.

        docs (iterable or Doc): The documents to modify.
        batch_tag_ids (list): One sequence of tag ids per document.
        tensors (list): Optional per-document tensors, appended to each doc
            via `Doc.extend_tensor`.
        """
        if isinstance(docs, Doc):
            docs = [docs]
        cdef Doc doc
        cdef int idx = 0
        cdef Vocab vocab = self.vocab
        for i, doc in enumerate(docs):
            doc_tag_ids = batch_tag_ids[i]
            if hasattr(doc_tag_ids, 'get'):
                doc_tag_ids = doc_tag_ids.get()
            for j, tag_id in enumerate(doc_tag_ids):
                # Don't clobber preset POS tags
                if doc.c[j].tag == 0 and doc.c[j].pos == 0:
                    # Don't clobber preset lemmas
                    lemma = doc.c[j].lemma
                    vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
                    if lemma != 0 and lemma != doc.c[j].lex.orth:
                        doc.c[j].lemma = lemma
                idx += 1
            if tensors is not None:
                if isinstance(doc.tensor, numpy.ndarray) \
                and not isinstance(tensors[i], numpy.ndarray):
                    doc.extend_tensor(tensors[i].get())
                else:
                    doc.extend_tensor(tensors[i])
            doc.is_tagged = True

    def update(self, docs, golds, drop=0., sgd=None, losses=None):
        """Run one training step on a batch.

        docs (iterable): A batch of `Doc` objects.
        golds (iterable): A batch of gold-standard annotations.
        drop (float): The dropout rate.
        sgd (callable): An optimizer.
        losses (dict): Optional record of losses, keyed by component name.
        """
        if losses is not None and self.name not in losses:
            losses[self.name] = 0.

        tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop)
        loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
        bp_tag_scores(d_tag_scores, sgd=sgd)

        if losses is not None:
            losses[self.name] += loss

    def get_loss(self, docs, golds, scores):
        """Compute the loss and gradient for a batch of tag scores.

        RETURNS (tuple): The loss as a float, and the gradient unflattened
            into one array per doc.
        """
        scores = self.model.ops.flatten(scores)
        tag_index = {tag: i for i, tag in enumerate(self.labels)}
        cdef int idx = 0
        correct = numpy.zeros((scores.shape[0],), dtype='i')
        guesses = scores.argmax(axis=1)
        for gold in golds:
            for tag in gold.tags:
                if tag is None:
                    # Missing annotation: use the model's own guess as target.
                    correct[idx] = guesses[idx]
                else:
                    correct[idx] = tag_index[tag]
                idx += 1
        correct = self.model.ops.xp.array(correct, dtype='i')
        d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
        d_scores /= d_scores.shape[0]
        loss = (d_scores**2).sum()
        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
        return float(loss), d_scores

    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
                       **kwargs):
        """Build the tag map from the training data, allocate the model if
        needed, and acquire an optimizer.

        gold_tuples (iterable): Gold-standard training data.
        pipeline (list): The pipeline the model is part of (not used here).
        sgd (callable): An optimizer, or None to create one.
        **kwargs: `pretrained_vectors` is read and stored in the config.
        RETURNS (callable): The optimizer.
        """
        orig_tag_map = dict(self.vocab.morphology.tag_map)
        new_tag_map = OrderedDict()
        for raw_text, annots_brackets in gold_tuples:
            for annots, brackets in annots_brackets:
                ids, words, tags, heads, deps, ents = annots
                for tag in tags:
                    if tag in orig_tag_map:
                        new_tag_map[tag] = orig_tag_map[tag]
                    else:
                        new_tag_map[tag] = {POS: X}
        cdef Vocab vocab = self.vocab
        if new_tag_map:
            vocab.morphology = Morphology(vocab.strings, new_tag_map,
                                          vocab.morphology.lemmatizer,
                                          exc=vocab.morphology.exc)
        self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
        if self.model is True:
            self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
        link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd

    @classmethod
    def Model(cls, n_tags, **cfg):
        """Build the tagger model.

        n_tags (int): Number of output classes.
        **cfg: Config parameters, passed on to `build_tagger_model`.
        RAISES (ValueError): If `pretrained_dims` is set without
            `pretrained_vectors`.
        """
        if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'):
            raise ValueError(
                "Bad configuration of Tagger --- this is probably a bug "
                "within spaCy. We changed the name of an internal attribute "
                "for loading pre-trained vectors, and the class has been "
                "passed the old name (pretrained_dims) but not the new name "
                "(pretrained_vectors)")
        return build_tagger_model(n_tags, **cfg)

    def add_label(self, label, values=None):
        """Add a tag to the tag map.

        label (unicode): The tag to add.
        values (dict): Tag map entry for the tag; defaults to {POS: "X"}.
        RETURNS (int): 0 if the label already exists, otherwise 1.
        RAISES (ValueError): If a model has already been allocated.
        """
        if label in self.labels:
            return 0
        if self.model not in (True, False, None):
            # Here's how the model resizing will work, once the
            # neuron-to-tag mapping is no longer controlled by
            # the Morphology class, which sorts the tag names.
            # The sorting makes adding labels difficult.
            # smaller = self.model._layers[-1]
            # larger = Softmax(len(self.labels)+1, smaller.nI)
            # copy_array(larger.W[:smaller.nO], smaller.W)
            # copy_array(larger.b[:smaller.nO], smaller.b)
            # self.model._layers[-1] = larger
            raise ValueError(
                "Resizing pre-trained Tagger models is not "
                "currently supported.")
        tag_map = dict(self.vocab.morphology.tag_map)
        if values is None:
            values = {POS: "X"}
        tag_map[label] = values
        self.vocab.morphology = Morphology(
            self.vocab.strings, tag_map=tag_map,
            lemmatizer=self.vocab.morphology.lemmatizer,
            exc=self.vocab.morphology.exc)
        return 1

    def use_params(self, params):
        """Generator that keeps `params` in effect on the model while it is
        suspended at its single `yield`."""
        with self.model.use_params(params):
            yield

    def to_bytes(self, **exclude):
        """Serialize the tagger (model, vocab, cfg and tag map) to a
        bytestring."""
        serialize = OrderedDict()
        if self.model in (None, True, False):
            serialize['model'] = lambda: self.model
        else:
            serialize['model'] = self.model.to_bytes
        serialize['vocab'] = self.vocab.to_bytes
        serialize['cfg'] = lambda: ujson.dumps(self.cfg)
        tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
        serialize['tag_map'] = lambda: msgpack.dumps(
            tag_map, use_bin_type=True, encoding='utf8')
        return util.to_bytes(serialize, exclude)

    def from_bytes(self, bytes_data, **exclude):
        """Load the tagger from a bytestring and return it."""
        def load_model(b):
            # TODO: Remove this once we don't have to handle previous models
            if 'pretrained_dims' in self.cfg and 'pretrained_vectors' not in self.cfg:
                self.cfg['pretrained_vectors'] = self.vocab.vectors.name

            if self.model is True:
                token_vector_width = util.env_opt(
                    'token_vector_width',
                    self.cfg.get('token_vector_width', 128))
                self.model = self.Model(self.vocab.morphology.n_tags,
                                        **self.cfg)
            self.model.from_bytes(b)

        def load_tag_map(b):
            tag_map = msgpack.loads(b, encoding='utf8')
            self.vocab.morphology = Morphology(
                self.vocab.strings, tag_map=tag_map,
                lemmatizer=self.vocab.morphology.lemmatizer,
                exc=self.vocab.morphology.exc)

        # Order matters: the tag map and cfg are loaded before the model,
        # because load_model() reads n_tags and the cfg.
        deserialize = OrderedDict((
            ('vocab', lambda b: self.vocab.from_bytes(b)),
            ('tag_map', load_tag_map),
            ('cfg', lambda b: self.cfg.update(ujson.loads(b))),
            ('model', lambda b: load_model(b)),
        ))
        util.from_bytes(bytes_data, deserialize, exclude)
        return self

    def to_disk(self, path, **exclude):
        """Serialize the tagger (vocab, tag map, model and cfg) to disk."""
        tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
        serialize = OrderedDict((
            ('vocab', lambda p: self.vocab.to_disk(p)),
            ('tag_map', lambda p: p.open('wb').write(msgpack.dumps(
                tag_map, use_bin_type=True, encoding='utf8'))),
            ('model', lambda p: p.open('wb').write(self.model.to_bytes())),
            ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg)))
        ))
        util.to_disk(path, serialize, exclude)

    def from_disk(self, path, **exclude):
        """Load the tagger from disk and return it."""
        def load_model(p):
            # TODO: Remove this once we don't have to handle previous models
            if 'pretrained_dims' in self.cfg and 'pretrained_vectors' not in self.cfg:
                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
            if self.model is True:
                self.model = self.Model(self.vocab.morphology.n_tags,
                                        **self.cfg)
            with p.open('rb') as file_:
                self.model.from_bytes(file_.read())

        def load_tag_map(p):
            with p.open('rb') as file_:
                tag_map = msgpack.loads(file_.read(), encoding='utf8')
            self.vocab.morphology = Morphology(
                self.vocab.strings, tag_map=tag_map,
                lemmatizer=self.vocab.morphology.lemmatizer,
                exc=self.vocab.morphology.exc)

        deserialize = OrderedDict((
            ('cfg', lambda p: self.cfg.update(_load_cfg(p))),
            ('vocab', lambda p: self.vocab.from_disk(p)),
            ('tag_map', load_tag_map),
            ('model', load_model),
        ))
        util.from_disk(path, deserialize, exclude)
        return self


class MultitaskObjective(Tagger):
    """Experimental: Assist training of a parser or tagger, by training a
    side-objective.
    """
    name = 'nn_labeller'

    def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
        """Create the objective.

        vocab (Vocab): The shared vocabulary.
        model (Model): A model instance, or `True` to allocate one later.
        target (unicode or callable): One of 'dep', 'tag', 'ent',
            'dep_tag_offset', 'ent_tag', or a function with the signature
            (i, words, tags, heads, deps, ents) returning a label or None.
        **cfg: Config parameters.
        RAISES (ValueError): If `target` is neither a known name nor callable.
        """
        self.vocab = vocab
        self.model = model
        if target == 'dep':
            self.make_label = self.make_dep
        elif target == 'tag':
            self.make_label = self.make_tag
        elif target == 'ent':
            self.make_label = self.make_ent
        elif target == 'dep_tag_offset':
            self.make_label = self.make_dep_tag_offset
        elif target == 'ent_tag':
            self.make_label = self.make_ent_tag
        elif hasattr(target, '__call__'):
            self.make_label = target
        else:
            raise ValueError("MultitaskObjective target should be function or "
                             "one of: dep, tag, ent, dep_tag_offset, ent_tag.")
        self.cfg = dict(cfg)
        self.cfg.setdefault('cnn_maxout_pieces', 2)

    @property
    def labels(self):
        """Mapping of label to class index, stored in the config."""
        return self.cfg.setdefault('labels', {})

    @labels.setter
    def labels(self, value):
        self.cfg['labels'] = value

    def set_annotations(self, docs, dep_ids, tensors=None):
        """Do nothing: this component does not annotate documents."""
        pass

    def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None,
                       sgd=None, **kwargs):
        """Collect the labels from the training data, allocate the model if
        needed, and acquire an optimizer.

        gold_tuples (iterable): Gold-standard training data.
        pipeline (list): The pipeline the model is part of (not used here).
        tok2vec (Model): The token-to-vector layer to build the model on.
        sgd (callable): An optimizer, or None to create one.
        RETURNS (callable): The optimizer.
        """
        gold_tuples = nonproj.preprocess_training_data(gold_tuples)
        for raw_text, annots_brackets in gold_tuples:
            for annots, brackets in annots_brackets:
                ids, words, tags, heads, deps, ents = annots
                for i in range(len(ids)):
                    label = self.make_label(i, words, tags, heads, deps, ents)
                    if label is not None and label not in self.labels:
                        self.labels[label] = len(self.labels)
        if self.model is True:
            token_vector_width = util.env_opt('token_vector_width')
            self.model = self.Model(len(self.labels), tok2vec=tok2vec)
        link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd

    @classmethod
    def Model(cls, n_tags, tok2vec=None, **cfg):
        """Chain the given tok2vec layer with a softmax over `n_tags` classes.

        RETURNS (Model): The chained model, with `tok2vec` and `softmax`
            attributes set.
        """
        token_vector_width = util.env_opt('token_vector_width', 128)
        softmax = Softmax(n_tags, token_vector_width)
        model = chain(
            tok2vec,
            softmax
        )
        model.tok2vec = tok2vec
        model.softmax = softmax
        return model

    def predict(self, docs):
        """Score a batch of documents.

        RETURNS (tuple): The tok2vec output and the softmax scores.
        """
        tokvecs = self.model.tok2vec(docs)
        scores = self.model.softmax(tokvecs)
        return tokvecs, scores

    def get_loss(self, docs, golds, scores):
        """Compute the loss and gradient for a batch of label scores.

        docs (iterable): A batch of `Doc` objects.
        golds (iterable): A batch of gold-standard annotations, one per doc.
        scores (object): The model output for the batch.
        RETURNS (tuple): The loss as a float, and the gradient of the scores.
        """
        assert len(docs) == len(golds)
        cdef int idx = 0
        correct = numpy.zeros((scores.shape[0],), dtype='i')
        guesses = scores.argmax(axis=1)
        for i, gold in enumerate(golds):
            for j in range(len(docs[i])):
                # Handles alignment for tokenization differences
                gold_idx = gold.cand_to_gold[j]
                if gold_idx is None:
                    # No gold token for this position: treat it like a
                    # missing label instead of leaving the target at the
                    # zero initialisation, which would train towards
                    # class index 0.
                    correct[idx] = guesses[idx]
                    idx += 1
                    continue
                label = self.make_label(gold_idx, gold.words, gold.tags,
                                        gold.heads, gold.labels, gold.ents)
                if label is None or label not in self.labels:
                    correct[idx] = guesses[idx]
                else:
                    correct[idx] = self.labels[label]
                idx += 1
        correct = self.model.ops.xp.array(correct, dtype='i')
        d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
        d_scores /= d_scores.shape[0]
        loss = (d_scores**2).sum()
        return float(loss), d_scores

    @staticmethod
    def make_dep(i, words, tags, heads, deps, ents):
        """Label token i with its dependency label, or None if the label or
        head is missing."""
        if deps[i] is None or heads[i] is None:
            return None
        return deps[i]

    @staticmethod
    def make_tag(i, words, tags, heads, deps, ents):
        """Label token i with its tag."""
        return tags[i]

    @staticmethod
    def make_ent(i, words, tags, heads, deps, ents):
        """Label token i with its entity annotation, or None if there are no
        entity annotations."""
        if ents is None:
            return None
        return ents[i]

    @staticmethod
    def make_dep_tag_offset(i, words, tags, heads, deps, ents):
        """Label token i with '<dep>-<tag>:<offset>', where offset is the
        distance to the head clipped to [-2, 2]. None if the dependency label
        or head is missing."""
        if deps[i] is None or heads[i] is None:
            return None
        offset = heads[i] - i
        offset = min(offset, 2)
        offset = max(offset, -2)
        return '%s-%s:%d' % (deps[i], tags[i], offset)

    @staticmethod
    def make_ent_tag(i, words, tags, heads, deps, ents):
        """Label token i with '<tag>-<ent>', or None if the entity annotation
        is missing."""
        if ents is None or ents[i] is None:
            return None
        else:
            return '%s-%s' % (tags[i], ents[i])


class SimilarityHook(Pipe):
    """
    Experimental: A pipeline component to install a hook for supervised
    similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
    documents. The similarity model can be any object obeying the Thinc `Model`
    interface. By default, the model concatenates the elementwise mean and
    elementwise max of the two tensors, and compares them using the
    Cauchy-like similarity function from Chen (2013):

        >>> similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())

    Where W is a vector of dimension weights, initialized to 1.
    """
    name = 'similarity'

    def __init__(self, vocab, model=True, **cfg):
        """Create the hook. Weights are not allocated on initialisation."""
        self.vocab = vocab
        self.model = model
        self.cfg = dict(cfg)

    @classmethod
    def Model(cls, length):
        """Build the default similarity model for vectors of width `length`."""
        return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))

    def __call__(self, doc):
        """Install similarity hook"""
        doc.user_hooks['similarity'] = self.predict
        return doc

    def pipe(self, docs, **kwargs):
        """Install the hook on each document of a stream."""
        for doc in docs:
            yield self(doc)

    def predict(self, doc1, doc2):
        """Run the similarity model on one pair of documents."""
        return self.model.predict([(doc1, doc2)])

    def update(self, doc1_doc2, golds, sgd=None, drop=0.):
        """Run the forward pass for a batch of document pairs.

        NOTE(review): the loss and backward pass are not implemented here;
        the results of `begin_update` are discarded.
        """
        sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)

    def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
        """Allocate model, using width from tensorizer in pipeline.

        _ (iterable): Gold-standard training data (not used).
        pipeline (list): The pipeline the model is part of. Its first element
            must have a model with an `nO` attribute.
        RETURNS (callable): The optimizer passed in, or a newly created one.
        """
        if self.model is True:
            self.model = self.Model(pipeline[0].model.nO)
            link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd


class TextCategorizer(Pipe):
    """Pipeline component that assigns a score per label to `Doc.cats`."""
    name = 'textcat'

    @classmethod
    def Model(cls, nr_class=1, width=64, **cfg):
        """Build the text classification model."""
        return build_text_classifier(nr_class, width, **cfg)

    def __init__(self, vocab, model=True, **cfg):
        """Create the categorizer. Weights are not allocated on
        initialisation."""
        self.vocab = vocab
        self.model = model
        self.cfg = dict(cfg)

    @property
    def labels(self):
        """The list of labels, stored in the config."""
        return self.cfg.setdefault('labels', [])

    @labels.setter
    def labels(self, value):
        self.cfg['labels'] = value

    def __call__(self, doc):
        """Categorize one document in place and return it."""
        scores, tensors = self.predict([doc])
        self.set_annotations([doc], scores, tensors=tensors)
        return doc

    def pipe(self, stream, batch_size=128, n_threads=-1):
        """Categorize a stream of documents in batches of `batch_size`.

        YIELDS (Doc): The documents, in order of input.
        """
        for docs in cytoolz.partition_all(batch_size, stream):
            docs = list(docs)
            scores, tensors = self.predict(docs)
            self.set_annotations(docs, scores, tensors=tensors)
            yield from docs

    def predict(self, docs):
        """Score a batch of documents without modifying them.

        RETURNS (tuple): The scores as an array, and the docs' tensors.
        """
        scores = self.model(docs)
        scores = self.model.ops.asarray(scores)
        tensors = [doc.tensor for doc in docs]
        return scores, tensors

    def set_annotations(self, docs, scores, tensors=None):
        """Write the score of every label to `doc.cats`. `tensors` is
        accepted for interface compatibility and not used."""
        for i, doc in enumerate(docs):
            for j, label in enumerate(self.labels):
                doc.cats[label] = float(scores[i, j])

    def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
        """Run one training step on a batch.

        docs (iterable): A batch of `Doc` objects.
        golds (iterable): A batch of gold-standard annotations.
        drop (float): The dropout rate.
        sgd (callable): An optimizer.
        losses (dict): Optional record of losses, keyed by component name.
        """
        scores, bp_scores = self.model.begin_update(docs, drop=drop)
        loss, d_scores = self.get_loss(docs, golds, scores)
        bp_scores(d_scores, sgd=sgd)
        if losses is not None:
            losses.setdefault(self.name, 0.0)
            losses[self.name] += loss

    def get_loss(self, docs, golds, scores):
        """Compute the loss and gradient for a batch of category scores.

        RETURNS (tuple): The mean squared error, and the gradient of the
            scores with entries for labels absent from `gold.cats` zeroed.
        """
        truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
        not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f')
        for i, gold in enumerate(golds):
            for j, label in enumerate(self.labels):
                if label in gold.cats:
                    truths[i, j] = gold.cats[label]
                else:
                    not_missing[i, j] = 0.
        truths = self.model.ops.asarray(truths)
        not_missing = self.model.ops.asarray(not_missing)
        d_scores = (scores-truths) / scores.shape[0]
        d_scores *= not_missing
        mean_square_error = ((scores-truths)**2).sum(axis=1).mean()
        return mean_square_error, d_scores

    def add_label(self, label):
        """Add a label, growing the model's last layer if a model exists.

        RETURNS (int): 0 if the label already exists, otherwise 1.
        """
        if label in self.labels:
            return 0
        if self.model not in (None, True, False):
            # Replace the output layer by one with an extra row, keeping the
            # existing weights.
            smaller = self.model._layers[-1]
            larger = Affine(len(self.labels)+1, smaller.nI)
            copy_array(larger.W[:smaller.nO], smaller.W)
            copy_array(larger.b[:smaller.nO], smaller.b)
            self.model._layers[-1] = larger
        self.labels.append(label)
        return 1

    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
                       **kwargs):
        """Allocate the model if needed and acquire an optimizer.

        gold_tuples (iterable): Gold-standard training data (not used here).
        pipeline (list): The pipeline the model is part of. If its first
            element is named 'tensorizer', that model's `nO` is used as the
            token vector width; otherwise 64 is used.
        sgd (callable): An optimizer, or None to create one.
        RETURNS (callable): The optimizer.
        """
        if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer':
            token_vector_width = pipeline[0].model.nO
        else:
            token_vector_width = 64

        if self.model is True:
            self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
            self.model = self.Model(len(self.labels), token_vector_width,
                                    **self.cfg)
            link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd


cdef class DependencyParser(Parser):
    name = 'parser'
    TransitionSystem = ArcEager

    @property
    def postprocesses(self):
        """Callbacks listed for post-processing: deprojectivization."""
        return [nonproj.deprojectivize]

    def add_multitask_objective(self, target):
        """Register a `MultitaskObjective` for the given target."""
        labeller = MultitaskObjective(self.vocab, target=target)
        self._multitasks.append(labeller)

    def init_multitask_objectives(self, gold_tuples, pipeline, sgd=None, **cfg):
        """Begin training of every registered multitask objective, sharing
        the first layer of this parser's model as their tok2vec."""
        for labeller in self._multitasks:
            tok2vec = self.model[0]
            labeller.begin_training(gold_tuples, pipeline=pipeline,
                                    tok2vec=tok2vec, sgd=sgd)

    def __reduce__(self):
        return (DependencyParser, (self.vocab, self.moves, self.model),
                None, None)


cdef class EntityRecognizer(Parser):
    name = 'ner'
    TransitionSystem = BiluoPushDown

    nr_feature = 6

    def add_multitask_objective(self, target):
        """Register a `MultitaskObjective` for the given target."""
        labeller = MultitaskObjective(self.vocab, target=target)
        self._multitasks.append(labeller)

    def init_multitask_objectives(self, gold_tuples, pipeline, sgd=None, **cfg):
        """Begin training of every registered multitask objective, sharing
        the first layer of this parser's model as their tok2vec."""
        for labeller in self._multitasks:
            tok2vec = self.model[0]
            # Forward the optimizer, as DependencyParser does, so that the
            # objective does not create a throwaway one.
            labeller.begin_training(gold_tuples, pipeline=pipeline,
                                    tok2vec=tok2vec, sgd=sgd)

    def __reduce__(self):
        return (EntityRecognizer, (self.vocab, self.moves, self.model),
                None, None)


__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer']