mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 21:21:10 +03:00 
			
		
		
		
	Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
		
						commit
						1d1f91a041
					
				|  | @ -26,8 +26,9 @@ from spacy.pipeline import TextCategorizer | ||||||
| @plac.annotations( | @plac.annotations( | ||||||
|     model=("Model name. Defaults to blank 'en' model.", "option", "m", str), |     model=("Model name. Defaults to blank 'en' model.", "option", "m", str), | ||||||
|     output_dir=("Optional output directory", "option", "o", Path), |     output_dir=("Optional output directory", "option", "o", Path), | ||||||
|  |     n_examples=("Number of texts to train from", "option", "N", int), | ||||||
|     n_iter=("Number of training iterations", "option", "n", int)) |     n_iter=("Number of training iterations", "option", "n", int)) | ||||||
| def main(model=None, output_dir=None, n_iter=20): | def main(model=None, output_dir=None, n_iter=20, n_texts=2000): | ||||||
|     if model is not None: |     if model is not None: | ||||||
|         nlp = spacy.load(model)  # load existing spaCy model |         nlp = spacy.load(model)  # load existing spaCy model | ||||||
|         print("Loaded model '%s'" % model) |         print("Loaded model '%s'" % model) | ||||||
|  | @ -50,7 +51,8 @@ def main(model=None, output_dir=None, n_iter=20): | ||||||
| 
 | 
 | ||||||
|     # load the IMDB dataset |     # load the IMDB dataset | ||||||
|     print("Loading IMDB data...") |     print("Loading IMDB data...") | ||||||
|     (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=2000) |     print("Using %d training examples" % n_texts) | ||||||
|  |     (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts) | ||||||
|     train_docs = [nlp.tokenizer(text) for text in train_texts] |     train_docs = [nlp.tokenizer(text) for text in train_texts] | ||||||
|     train_gold = [GoldParse(doc, cats=cats) for doc, cats in |     train_gold = [GoldParse(doc, cats=cats) for doc, cats in | ||||||
|                   zip(train_docs, train_cats)] |                   zip(train_docs, train_cats)] | ||||||
|  | @ -65,14 +67,14 @@ def main(model=None, output_dir=None, n_iter=20): | ||||||
|         for i in range(n_iter): |         for i in range(n_iter): | ||||||
|             losses = {} |             losses = {} | ||||||
|             # batch up the examples using spaCy's minibatch |             # batch up the examples using spaCy's minibatch | ||||||
|             batches = minibatch(train_data, size=compounding(4., 128., 1.001)) |             batches = minibatch(train_data, size=compounding(4., 32., 1.001)) | ||||||
|             for batch in batches: |             for batch in batches: | ||||||
|                 docs, golds = zip(*batch) |                 docs, golds = zip(*batch) | ||||||
|                 nlp.update(docs, golds, sgd=optimizer, drop=0.2, losses=losses) |                 nlp.update(docs, golds, sgd=optimizer, drop=0.2, losses=losses) | ||||||
|             with textcat.model.use_params(optimizer.averages): |             with textcat.model.use_params(optimizer.averages): | ||||||
|                 # evaluate on the dev data split off in load_data() |                 # evaluate on the dev data split off in load_data() | ||||||
|                 scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats) |                 scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats) | ||||||
|             print('{0:.3f}\t{0:.3f}\t{0:.3f}\t{0:.3f}'  # print a simple table |             print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table | ||||||
|                   .format(losses['textcat'], scores['textcat_p'], |                   .format(losses['textcat'], scores['textcat_p'], | ||||||
|                           scores['textcat_r'], scores['textcat_f'])) |                           scores['textcat_r'], scores['textcat_f'])) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -434,7 +434,7 @@ def build_text_classifier(nr_class, width=64, **cfg): | ||||||
|     pretrained_dims = cfg.get('pretrained_dims', 0) |     pretrained_dims = cfg.get('pretrained_dims', 0) | ||||||
|     with Model.define_operators({'>>': chain, '+': add, '|': concatenate, |     with Model.define_operators({'>>': chain, '+': add, '|': concatenate, | ||||||
|                                  '**': clone}): |                                  '**': clone}): | ||||||
|         if cfg.get('low_data'): |         if cfg.get('low_data') and pretrained_dims: | ||||||
|             model = ( |             model = ( | ||||||
|                 SpacyVectors |                 SpacyVectors | ||||||
|                 >> flatten_add_lengths |                 >> flatten_add_lengths | ||||||
|  |  | ||||||
|  | @ -11,9 +11,9 @@ import ujson | ||||||
| import msgpack | import msgpack | ||||||
| 
 | 
 | ||||||
| from thinc.api import chain | from thinc.api import chain | ||||||
| from thinc.v2v import Softmax | from thinc.v2v import Affine, Softmax | ||||||
| from thinc.t2v import Pooling, max_pool, mean_pool | from thinc.t2v import Pooling, max_pool, mean_pool | ||||||
| from thinc.neural.util import to_categorical | from thinc.neural.util import to_categorical, copy_array | ||||||
| from thinc.neural._classes.difference import Siamese, CauchySimilarity | from thinc.neural._classes.difference import Siamese, CauchySimilarity | ||||||
| 
 | 
 | ||||||
| from .tokens.doc cimport Doc | from .tokens.doc cimport Doc | ||||||
|  | @ -130,6 +130,15 @@ class Pipe(object): | ||||||
|         documents and their predicted scores.""" |         documents and their predicted scores.""" | ||||||
|         raise NotImplementedError |         raise NotImplementedError | ||||||
| 
 | 
 | ||||||
|  |     def add_label(self, label): | ||||||
|  |         """Add an output label, to be predicted by the model. | ||||||
|  | 
 | ||||||
|  |         It's possible to extend pre-trained models with new labels, | ||||||
|  |         but care should be taken to avoid the "catastrophic forgetting" | ||||||
|  |         problem. | ||||||
|  |         """ | ||||||
|  |         raise NotImplementedError | ||||||
|  | 
 | ||||||
|     def begin_training(self, gold_tuples=tuple(), pipeline=None): |     def begin_training(self, gold_tuples=tuple(), pipeline=None): | ||||||
|         """Initialize the pipe for training, using data examples if available. |         """Initialize the pipe for training, using data examples if available. | ||||||
|         If no model has been initialized yet, the model is added.""" |         If no model has been initialized yet, the model is added.""" | ||||||
|  | @ -325,6 +334,14 @@ class Tagger(Pipe): | ||||||
|         self.cfg.setdefault('pretrained_dims', |         self.cfg.setdefault('pretrained_dims', | ||||||
|                             self.vocab.vectors.data.shape[1]) |                             self.vocab.vectors.data.shape[1]) | ||||||
| 
 | 
 | ||||||
|  |     @property | ||||||
|  |     def labels(self): | ||||||
|  |         return self.cfg.setdefault('tag_names', []) | ||||||
|  | 
 | ||||||
|  |     @labels.setter | ||||||
|  |     def labels(self, value): | ||||||
|  |         self.cfg['tag_names'] = value | ||||||
|  | 
 | ||||||
|     def __call__(self, doc): |     def __call__(self, doc): | ||||||
|         tags = self.predict([doc]) |         tags = self.predict([doc]) | ||||||
|         self.set_annotations([doc], tags) |         self.set_annotations([doc], tags) | ||||||
|  | @ -352,6 +369,7 @@ class Tagger(Pipe): | ||||||
|         cdef Doc doc |         cdef Doc doc | ||||||
|         cdef int idx = 0 |         cdef int idx = 0 | ||||||
|         cdef Vocab vocab = self.vocab |         cdef Vocab vocab = self.vocab | ||||||
|  |         tags = list(self.labels) | ||||||
|         for i, doc in enumerate(docs): |         for i, doc in enumerate(docs): | ||||||
|             doc_tag_ids = batch_tag_ids[i] |             doc_tag_ids = batch_tag_ids[i] | ||||||
|             if hasattr(doc_tag_ids, 'get'): |             if hasattr(doc_tag_ids, 'get'): | ||||||
|  | @ -359,7 +377,7 @@ class Tagger(Pipe): | ||||||
|             for j, tag_id in enumerate(doc_tag_ids): |             for j, tag_id in enumerate(doc_tag_ids): | ||||||
|                 # Don't clobber preset POS tags |                 # Don't clobber preset POS tags | ||||||
|                 if doc.c[j].tag == 0 and doc.c[j].pos == 0: |                 if doc.c[j].tag == 0 and doc.c[j].pos == 0: | ||||||
|                     vocab.morphology.assign_tag_id(&doc.c[j], tag_id) |                     vocab.morphology.assign_tag(&doc.c[j], tags[tag_id]) | ||||||
|                 idx += 1 |                 idx += 1 | ||||||
|         doc.is_tagged = True |         doc.is_tagged = True | ||||||
| 
 | 
 | ||||||
|  | @ -420,6 +438,17 @@ class Tagger(Pipe): | ||||||
|     def Model(cls, n_tags, **cfg): |     def Model(cls, n_tags, **cfg): | ||||||
|         return build_tagger_model(n_tags, **cfg) |         return build_tagger_model(n_tags, **cfg) | ||||||
| 
 | 
 | ||||||
|  |     def add_label(self, label): | ||||||
|  |         if label in self.labels: | ||||||
|  |             return 0 | ||||||
|  |         smaller = self.model[-1]._layers[-1] | ||||||
|  |         larger = Softmax(len(self.labels)+1, smaller.nI) | ||||||
|  |         copy_array(larger.W[:smaller.nO], smaller.W) | ||||||
|  |         copy_array(larger.b[:smaller.nO], smaller.b) | ||||||
|  |         self.model[-1]._layers[-1] = larger | ||||||
|  |         self.labels.append(label) | ||||||
|  |         return 1 | ||||||
|  | 
 | ||||||
|     def use_params(self, params): |     def use_params(self, params): | ||||||
|         with self.model.use_params(params): |         with self.model.use_params(params): | ||||||
|             yield |             yield | ||||||
|  | @ -675,7 +704,7 @@ class TextCategorizer(Pipe): | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|     def labels(self): |     def labels(self): | ||||||
|         return self.cfg.get('labels', ['LABEL']) |         return self.cfg.setdefault('labels', ['LABEL']) | ||||||
| 
 | 
 | ||||||
|     @labels.setter |     @labels.setter | ||||||
|     def labels(self, value): |     def labels(self, value): | ||||||
|  | @ -727,6 +756,17 @@ class TextCategorizer(Pipe): | ||||||
|         mean_square_error = ((scores-truths)**2).sum(axis=1).mean() |         mean_square_error = ((scores-truths)**2).sum(axis=1).mean() | ||||||
|         return mean_square_error, d_scores |         return mean_square_error, d_scores | ||||||
| 
 | 
 | ||||||
|  |     def add_label(self, label): | ||||||
|  |         if label in self.labels: | ||||||
|  |             return 0 | ||||||
|  |         smaller = self.model[-1]._layers[-1] | ||||||
|  |         larger = Affine(len(self.labels)+1, smaller.nI) | ||||||
|  |         copy_array(larger.W[:smaller.nO], smaller.W) | ||||||
|  |         copy_array(larger.b[:smaller.nO], smaller.b) | ||||||
|  |         self.model[-1]._layers[-1] = larger | ||||||
|  |         self.labels.append(label) | ||||||
|  |         return 1 | ||||||
|  | 
 | ||||||
|     def begin_training(self, gold_tuples=tuple(), pipeline=None): |     def begin_training(self, gold_tuples=tuple(), pipeline=None): | ||||||
|         if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer': |         if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer': | ||||||
|             token_vector_width = pipeline[0].model.nO |             token_vector_width = pipeline[0].model.nO | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user