mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 13:11:03 +03:00 
			
		
		
		
	Pass option for pretrained vectors in pipeline
This commit is contained in:
		
							parent
							
								
									2a93404da6
								
							
						
					
					
						commit
						84e637e2e6
					
				|  | @ -41,7 +41,7 @@ from .syntax import nonproj | ||||||
| from .compat import json_dumps | from .compat import json_dumps | ||||||
| 
 | 
 | ||||||
| from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS | from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS | ||||||
| from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats | from ._ml import rebatch, Tok2Vec, flatten | ||||||
| from ._ml import build_text_classifier, build_tagger_model | from ._ml import build_text_classifier, build_tagger_model | ||||||
| from .parts_of_speech import X | from .parts_of_speech import X | ||||||
| 
 | 
 | ||||||
|  | @ -137,6 +137,7 @@ class BaseThincComponent(object): | ||||||
|     def from_bytes(self, bytes_data, **exclude): |     def from_bytes(self, bytes_data, **exclude): | ||||||
|         def load_model(b): |         def load_model(b): | ||||||
|             if self.model is True: |             if self.model is True: | ||||||
|  |                 self.cfg['pretrained_dims'] = self.vocab.vectors_length | ||||||
|                 self.model = self.Model(**self.cfg) |                 self.model = self.Model(**self.cfg) | ||||||
|             self.model.from_bytes(b) |             self.model.from_bytes(b) | ||||||
| 
 | 
 | ||||||
|  | @ -159,6 +160,7 @@ class BaseThincComponent(object): | ||||||
|     def from_disk(self, path, **exclude): |     def from_disk(self, path, **exclude): | ||||||
|         def load_model(p): |         def load_model(p): | ||||||
|             if self.model is True: |             if self.model is True: | ||||||
|  |                 self.cfg['pretrained_dims'] = self.vocab.vectors_length | ||||||
|                 self.model = self.Model(**self.cfg) |                 self.model = self.Model(**self.cfg) | ||||||
|             self.model.from_bytes(p.open('rb').read()) |             self.model.from_bytes(p.open('rb').read()) | ||||||
| 
 | 
 | ||||||
|  | @ -193,7 +195,7 @@ class TokenVectorEncoder(BaseThincComponent): | ||||||
|         """ |         """ | ||||||
|         width = util.env_opt('token_vector_width', width) |         width = util.env_opt('token_vector_width', width) | ||||||
|         embed_size = util.env_opt('embed_size', embed_size) |         embed_size = util.env_opt('embed_size', embed_size) | ||||||
|         return Tok2Vec(width, embed_size, preprocess=None) |         return Tok2Vec(width, embed_size, **cfg) | ||||||
| 
 | 
 | ||||||
|     def __init__(self, vocab, model=True, **cfg): |     def __init__(self, vocab, model=True, **cfg): | ||||||
|         """Construct a new statistical model. Weights are not allocated on |         """Construct a new statistical model. Weights are not allocated on | ||||||
|  | @ -210,7 +212,6 @@ class TokenVectorEncoder(BaseThincComponent): | ||||||
|             >>> tok2vec.model = tok2vec.Model(128, 5000) |             >>> tok2vec.model = tok2vec.Model(128, 5000) | ||||||
|         """ |         """ | ||||||
|         self.vocab = vocab |         self.vocab = vocab | ||||||
|         self.doc2feats = doc2feats() |  | ||||||
|         self.model = model |         self.model = model | ||||||
|         self.cfg = dict(cfg) |         self.cfg = dict(cfg) | ||||||
| 
 | 
 | ||||||
|  | @ -245,8 +246,7 @@ class TokenVectorEncoder(BaseThincComponent): | ||||||
|         docs (iterable): A sequence of `Doc` objects. |         docs (iterable): A sequence of `Doc` objects. | ||||||
|         RETURNS (object): Vector representations for each token in the documents. |         RETURNS (object): Vector representations for each token in the documents. | ||||||
|         """ |         """ | ||||||
|         feats = self.doc2feats(docs) |         tokvecs = self.model(docs) | ||||||
|         tokvecs = self.model(feats) |  | ||||||
|         return tokvecs |         return tokvecs | ||||||
| 
 | 
 | ||||||
|     def set_annotations(self, docs, tokvecses): |     def set_annotations(self, docs, tokvecses): | ||||||
|  | @ -270,8 +270,7 @@ class TokenVectorEncoder(BaseThincComponent): | ||||||
|         """ |         """ | ||||||
|         if isinstance(docs, Doc): |         if isinstance(docs, Doc): | ||||||
|             docs = [docs] |             docs = [docs] | ||||||
|         feats = self.doc2feats(docs) |         tokvecs, bp_tokvecs = self.model.begin_update(docs, drop=drop) | ||||||
|         tokvecs, bp_tokvecs = self.model.begin_update(feats, drop=drop) |  | ||||||
|         return tokvecs, bp_tokvecs |         return tokvecs, bp_tokvecs | ||||||
| 
 | 
 | ||||||
|     def get_loss(self, docs, golds, scores): |     def get_loss(self, docs, golds, scores): | ||||||
|  | @ -285,9 +284,8 @@ class TokenVectorEncoder(BaseThincComponent): | ||||||
|         gold_tuples (iterable): Gold-standard training data. |         gold_tuples (iterable): Gold-standard training data. | ||||||
|         pipeline (list): The pipeline the model is part of. |         pipeline (list): The pipeline the model is part of. | ||||||
|         """ |         """ | ||||||
|         self.doc2feats = doc2feats() |  | ||||||
|         if self.model is True: |         if self.model is True: | ||||||
|             self.model = self.Model() |             self.model = self.Model(**self.cfg) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class NeuralTagger(BaseThincComponent): | class NeuralTagger(BaseThincComponent): | ||||||
|  | @ -394,11 +392,13 @@ class NeuralTagger(BaseThincComponent): | ||||||
|                                           exc=vocab.morphology.exc) |                                           exc=vocab.morphology.exc) | ||||||
|         token_vector_width = pipeline[0].model.nO |         token_vector_width = pipeline[0].model.nO | ||||||
|         if self.model is True: |         if self.model is True: | ||||||
|             self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width) |             self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width, | ||||||
|  |                                     pretrained_dims=self.vocab.vectors_length) | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def Model(cls, n_tags, token_vector_width): |     def Model(cls, n_tags, token_vector_width, pretrained_dims=0): | ||||||
|         return build_tagger_model(n_tags, token_vector_width) |         return build_tagger_model(n_tags, token_vector_width, | ||||||
|  |                                   pretrained_dims) | ||||||
| 
 | 
 | ||||||
|     def use_params(self, params): |     def use_params(self, params): | ||||||
|         with self.model.use_params(params): |         with self.model.use_params(params): | ||||||
|  | @ -419,7 +419,8 @@ class NeuralTagger(BaseThincComponent): | ||||||
|             if self.model is True: |             if self.model is True: | ||||||
|                 token_vector_width = util.env_opt('token_vector_width', |                 token_vector_width = util.env_opt('token_vector_width', | ||||||
|                         self.cfg.get('token_vector_width', 128)) |                         self.cfg.get('token_vector_width', 128)) | ||||||
|                 self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width) |                 self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width, | ||||||
|  |                                         pretrained_dims=self.vocab.vectors_length) | ||||||
|             self.model.from_bytes(b) |             self.model.from_bytes(b) | ||||||
| 
 | 
 | ||||||
|         def load_tag_map(b): |         def load_tag_map(b): | ||||||
|  | @ -454,7 +455,8 @@ class NeuralTagger(BaseThincComponent): | ||||||
|             if self.model is True: |             if self.model is True: | ||||||
|                 token_vector_width = util.env_opt('token_vector_width', |                 token_vector_width = util.env_opt('token_vector_width', | ||||||
|                         self.cfg.get('token_vector_width', 128)) |                         self.cfg.get('token_vector_width', 128)) | ||||||
|                 self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width) |                 self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width, | ||||||
|  |                                         pretrained_dims=self.vocab.vectors_length) | ||||||
|             self.model.from_bytes(p.open('rb').read()) |             self.model.from_bytes(p.open('rb').read()) | ||||||
| 
 | 
 | ||||||
|         def load_tag_map(p): |         def load_tag_map(p): | ||||||
|  | @ -503,11 +505,13 @@ class NeuralLabeller(NeuralTagger): | ||||||
|                         self.labels[dep] = len(self.labels) |                         self.labels[dep] = len(self.labels) | ||||||
|         token_vector_width = pipeline[0].model.nO |         token_vector_width = pipeline[0].model.nO | ||||||
|         if self.model is True: |         if self.model is True: | ||||||
|             self.model = self.Model(len(self.labels), token_vector_width) |             self.model = self.Model(len(self.labels), token_vector_width, | ||||||
|  |                                     pretrained_dims=self.vocab.vectors_length) | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def Model(cls, n_tags, token_vector_width): |     def Model(cls, n_tags, token_vector_width, pretrained_dims=0): | ||||||
|         return build_tagger_model(n_tags, token_vector_width) |         return build_tagger_model(n_tags, token_vector_width, | ||||||
|  |                                   pretrained_dims) | ||||||
| 
 | 
 | ||||||
|     def get_loss(self, docs, golds, scores): |     def get_loss(self, docs, golds, scores): | ||||||
|         scores = self.model.ops.flatten(scores) |         scores = self.model.ops.flatten(scores) | ||||||
|  | @ -653,6 +657,7 @@ class TextCategorizer(BaseThincComponent): | ||||||
|         else: |         else: | ||||||
|             token_vector_width = 64 |             token_vector_width = 64 | ||||||
|         if self.model is True: |         if self.model is True: | ||||||
|  |             self.cfg['pretrained_dims'] = self.vocab.vectors_length | ||||||
|             self.model = self.Model(len(self.labels), token_vector_width, |             self.model = self.Model(len(self.labels), token_vector_width, | ||||||
|                                     **self.cfg) |                                     **self.cfg) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user