Mirror of https://github.com/explosion/spaCy.git

Add text-classification hook to pipeline

commit a231b56d40 (parent 7ea50182a5)
@@ -42,10 +42,89 @@ from .compat import json_dumps
 from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
 from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
+from ._ml import build_text_classifier
 from .parts_of_speech import X
 
 
-class TokenVectorEncoder(object):
+class BaseThincComponent(object):
+    name = None
+
+    @classmethod
+    def Model(cls, *shape, **kwargs):
+        raise NotImplementedError
+
+    def __init__(self, vocab, model=True, **cfg):
+        raise NotImplementedError
+
+    def __call__(self, doc):
+        scores = self.predict([doc])
+        self.set_annotations([doc], scores)
+        return doc
+
+    def pipe(self, stream, batch_size=128, n_threads=-1):
+        for docs in cytoolz.partition_all(batch_size, stream):
+            docs = list(docs)
+            scores = self.predict(docs)
+            self.set_annotations(docs, scores)
+            yield from docs
+
+    def predict(self, docs):
+        raise NotImplementedError
+
+    def set_annotations(self, docs, scores):
+        raise NotImplementedError
+
+    def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
+        raise NotImplementedError
+
+    def get_loss(self, docs, golds, scores):
+        raise NotImplementedError
+
+    def begin_training(self, gold_tuples, pipeline=None):
+        token_vector_width = pipeline[0].model.nO
+        if self.model is True:
+            self.model = self.Model(1, token_vector_width)
+
+    def use_params(self, params):
+        with self.model.use_params(params):
+            yield
+
+    def to_bytes(self, **exclude):
+        serialize = OrderedDict((
+            ('model', lambda: self.model.to_bytes()),
+            ('vocab', lambda: self.vocab.to_bytes())
+        ))
+        return util.to_bytes(serialize, exclude)
+
+    def from_bytes(self, bytes_data, **exclude):
+        if self.model is True:
+            self.model = self.Model()
+        deserialize = OrderedDict((
+            ('model', lambda b: self.model.from_bytes(b)),
+            ('vocab', lambda b: self.vocab.from_bytes(b))
+        ))
+        util.from_bytes(bytes_data, deserialize, exclude)
+        return self
+
+    def to_disk(self, path, **exclude):
+        serialize = OrderedDict((
+            ('model', lambda p: p.open('wb').write(self.model.to_bytes())),
+            ('vocab', lambda p: self.vocab.to_disk(p))
+        ))
+        util.to_disk(path, serialize, exclude)
+
+    def from_disk(self, path, **exclude):
+        if self.model is True:
+            self.model = self.Model()
+        deserialize = OrderedDict((
+            ('model', lambda p: self.model.from_bytes(p.open('rb').read())),
+            ('vocab', lambda p: self.vocab.from_disk(p))
+        ))
+        util.from_disk(path, deserialize, exclude)
+        return self
+
+
+class TokenVectorEncoder(BaseThincComponent):
     """Assign position-sensitive vectors to tokens, using a CNN or RNN."""
     name = 'tensorizer'
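The new BaseThincComponent pins down the contract shared by the neural pipeline components: a Model() classmethod that builds the Thinc model, predict()/set_annotations() for inference, update()/get_loss() for training, and uniform to/from_bytes/disk serialization. A minimal subclass might look like the following sketch; the component name, the Softmax model choice and the user_data key are assumptions for illustration, not part of this commit.

    # Hedged sketch of the subclass contract: define Model(), __init__(),
    # predict() and set_annotations(); __call__(), pipe() and the
    # serialization plumbing are inherited from BaseThincComponent.
    from thinc.neural import Softmax

    class SentenceTypeHook(BaseThincComponent):   # hypothetical component
        name = 'sentence-type'

        @classmethod
        def Model(cls, nr_class, token_vector_width, **cfg):
            # Any Thinc model mapping pooled doc vectors to class scores works.
            return Softmax(nr_class, token_vector_width)

        def __init__(self, vocab, model=True, **cfg):
            self.vocab = vocab
            self.model = model

        def predict(self, docs):
            # Pool the tensor the tensorizer set on each doc, then classify.
            X = self.model.ops.asarray([doc.tensor.sum(axis=0) for doc in docs])
            return self.model(X)

        def set_annotations(self, docs, scores):
            for i, doc in enumerate(docs):
                doc.user_data['sentence_type'] = int(scores[i].argmax())

Note that the base begin_training() sizes the model from pipeline[0].model.nO, so the tensorizer is assumed to run first in the pipeline.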
@@ -155,51 +234,8 @@ class TokenVectorEncoder(object):
         if self.model is True:
             self.model = self.Model()
 
-    def use_params(self, params):
-        """Replace weights of models in the pipeline with those provided in the
-        params dictionary.
-
-        params (dict): A dictionary of parameters keyed by model ID.
-        """
-        with self.model.use_params(params):
-            yield
-
-    def to_bytes(self, **exclude):
-        serialize = OrderedDict((
-            ('model', lambda: self.model.to_bytes()),
-            ('vocab', lambda: self.vocab.to_bytes())
-        ))
-        return util.to_bytes(serialize, exclude)
-
-    def from_bytes(self, bytes_data, **exclude):
-        if self.model is True:
-            self.model = self.Model()
-        deserialize = OrderedDict((
-            ('model', lambda b: self.model.from_bytes(b)),
-            ('vocab', lambda b: self.vocab.from_bytes(b))
-        ))
-        util.from_bytes(bytes_data, deserialize, exclude)
-        return self
-
-    def to_disk(self, path, **exclude):
-        serialize = OrderedDict((
-            ('model', lambda p: p.open('wb').write(self.model.to_bytes())),
-            ('vocab', lambda p: self.vocab.to_disk(p))
-        ))
-        util.to_disk(path, serialize, exclude)
-
-    def from_disk(self, path, **exclude):
-        if self.model is True:
-            self.model = self.Model()
-        deserialize = OrderedDict((
-            ('model', lambda p: self.model.from_bytes(p.open('rb').read())),
-            ('vocab', lambda p: self.vocab.from_disk(p))
-        ))
-        util.from_disk(path, deserialize, exclude)
-        return self
-
-
-class NeuralTagger(object):
+
+class NeuralTagger(BaseThincComponent):
     name = 'tagger'
     def __init__(self, vocab, model=True):
         self.vocab = vocab
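Because the serialization methods now live on the base class, every component can be round-tripped the same way. A hedged usage sketch follows; the `tensorizer` instance and the paths are illustrative only.

    # Sketch: round-tripping a component through the inherited serialization
    # API. `tensorizer` is assumed to be an initialized TokenVectorEncoder.
    from pathlib import Path

    data = tensorizer.to_bytes()      # msgpack bytes covering model + vocab
    restored = TokenVectorEncoder(tensorizer.vocab).from_bytes(data)

    # from_bytes()/from_disk() first build a default Model() when self.model
    # is still True, i.e. when no model has been constructed yet.
    tensorizer.to_disk(Path('/tmp/tensorizer'))
    restored = TokenVectorEncoder(tensorizer.vocab).from_disk(Path('/tmp/tensorizer'))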
@@ -252,7 +288,6 @@ class NeuralTagger(object):
         loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
 
         d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
-
         return d_tokvecs
 
     def get_loss(self, docs, golds, scores):
@@ -423,7 +458,7 @@ class NeuralLabeller(NeuralTagger):
         return float(loss), d_scores
 
 
-class SimilarityHook(object):
+class SimilarityHook(BaseThincComponent):
     """
     Experimental
 
@@ -477,48 +512,65 @@ class SimilarityHook(object):
         if self.model is True:
             self.model = self.Model(pipeline[0].model.nO)
 
-    def use_params(self, params):
-        """Replace weights of models in the pipeline with those provided in the
-        params dictionary.
-
-        params (dict): A dictionary of parameters keyed by model ID.
-        """
-        with self.model.use_params(params):
-            yield
-
-    def to_bytes(self, **exclude):
-        serialize = OrderedDict((
-            ('model', lambda: self.model.to_bytes()),
-            ('vocab', lambda: self.vocab.to_bytes())
-        ))
-        return util.to_bytes(serialize, exclude)
-
-    def from_bytes(self, bytes_data, **exclude):
+
+class TextClassifier(BaseThincComponent):
+    name = 'text-classifier'
+
+    @classmethod
+    def Model(cls, nr_class, width=64, **cfg):
+        return build_text_classifier(nr_class, width, **cfg)
+
+    def __init__(self, vocab, model=True, **cfg):
+        self.vocab = vocab
+        self.model = model
+        self.labels = cfg.get('labels', ['LABEL'])
+
+    def __call__(self, doc):
+        scores = self.predict([doc])
+        self.set_annotations([doc], scores)
+        return doc
+
+    def pipe(self, stream, batch_size=128, n_threads=-1):
+        for docs in cytoolz.partition_all(batch_size, stream):
+            docs = list(docs)
+            scores = self.predict(docs)
+            self.set_annotations(docs, scores)
+            yield from docs
+
+    def predict(self, docs):
+        scores = self.model(docs)
+        scores = self.model.ops.asarray(scores)
+        return scores
+
+    def set_annotations(self, docs, scores):
+        for i, doc in enumerate(docs):
+            for j, label in enumerate(self.labels):
+                doc.cats[label] = float(scores[i, j])
+
+    def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
+        docs, tensors = docs_tensors
+        scores, bp_scores = self.model.begin_update(docs, drop=drop)
+        loss, d_scores = self.get_loss(docs, golds, scores)
+        d_tensors = bp_scores(d_scores, sgd=sgd)
+        if losses is not None:
+            losses.setdefault(self.name, 0.0)
+            losses[self.name] += loss
+        return d_tensors
+
+    def get_loss(self, docs, golds, scores):
+        truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
+        for i, gold in enumerate(golds):
+            for j, label in enumerate(self.labels):
+                truths[i, j] = label in gold.cats
+        truths = self.model.ops.asarray(truths)
+        d_scores = (scores-truths) / scores.shape[0]
+        mean_square_error = ((scores-truths)**2).sum(axis=1).mean()
+        return mean_square_error, d_scores
+
+    def begin_training(self, gold_tuples, pipeline=None):
+        token_vector_width = pipeline[0].model.nO
         if self.model is True:
-            self.model = self.Model()
-        deserialize = OrderedDict((
-            ('model', lambda b: self.model.from_bytes(b)),
-            ('vocab', lambda b: self.vocab.from_bytes(b))
-        ))
-        util.from_bytes(bytes_data, deserialize, exclude)
-        return self
-
-    def to_disk(self, path, **exclude):
-        serialize = OrderedDict((
-            ('model', lambda p: p.open('wb').write(self.model.to_bytes())),
-            ('vocab', lambda p: self.vocab.to_disk(p))
-        ))
-        util.to_disk(path, serialize, exclude)
-
-    def from_disk(self, path, **exclude):
-        if self.model is True:
-            self.model = self.Model()
-        deserialize = OrderedDict((
-            ('model', lambda p: self.model.from_bytes(p.open('rb').read())),
-            ('vocab', lambda p: self.vocab.from_disk(p))
-        ))
-        util.from_disk(path, deserialize, exclude)
-        return self
+            self.model = self.Model(len(self.labels), token_vector_width)
 
 
 cdef class EntityRecognizer(LinearParser):
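The new TextClassifier scores a document against a fixed label set and writes one score per label into doc.cats. A hedged usage sketch follows; the `nlp` object, label names and text are assumptions for illustration.

    # Sketch: wiring the new hook into a pipeline. begin_training() sizes
    # the model from the tensorizer, which must be pipeline[0].
    textcat = TextClassifier(nlp.vocab, labels=['SPAM', 'HAM'])
    textcat.begin_training([], pipeline=nlp.pipeline)   # builds the Thinc model
    nlp.pipeline.append(textcat)

    doc = nlp(u'Win a free holiday now!!!')
    print(doc.cats)   # e.g. {'SPAM': 0.93, 'HAM': 0.08}

Because get_loss() compares the scores against a multi-hot truth matrix with squared error, the per-label scores are independent and need not sum to one; during training, update() backprops that gradient and returns d_tensors so upstream components can continue the backward pass.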
@@ -569,6 +621,14 @@ cdef class NeuralEntityRecognizer(NeuralParser):
 
     nr_feature = 6
 
+    def predict_confidences(self, docs):
+        tensors = [d.tensor for d in docs]
+        samples = []
+        for i in range(10):
+            states = self.parse_batch(docs, tensors, drop=0.3)
+            for state in states:
+                samples.append(self._get_entities(state))
+
     def __reduce__(self):
         return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)
 
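predict_confidences() parses each batch ten times under dropout (drop=0.3), a Monte Carlo dropout estimate; as committed it collects the sampled entity analyses without aggregating or returning them. One way the samples could be reduced to per-entity confidences is sketched below; the aggregation helper is hypothetical and not part of this commit.

    # Hedged sketch: the confidence of an entity is the fraction of
    # stochastic parses in which it was predicted. `samples` is assumed to
    # be the flat list predict_confidences builds, doc-major within each of
    # the n_samples dropout passes.
    from collections import Counter

    def entity_confidences(samples, n_docs, n_samples=10):
        counts = [Counter() for _ in range(n_docs)]
        for k, entities in enumerate(samples):
            doc_i = k % n_docs
            for ent in entities:       # e.g. (start, end, label) tuples
                counts[doc_i][ent] += 1
        return [{ent: c / float(n_samples) for ent, c in doc_counts.items()}
                for doc_counts in counts]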