mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-29 23:17:59 +03:00 
			
		
		
		
	Stream the gold data during training, to reduce memory usage
This commit is contained in:
		
							parent
							
								
									7c0823ad76
								
							
						
					
					
						commit
						c3d168509a
					
				|  | @ -13,7 +13,7 @@ from . import _align | ||||||
| from .syntax import nonproj | from .syntax import nonproj | ||||||
| from .tokens import Doc | from .tokens import Doc | ||||||
| from . import util | from . import util | ||||||
| from .util import minibatch | from .util import minibatch, itershuffle | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def tags_to_entities(tags): | def tags_to_entities(tags): | ||||||
|  | @ -133,15 +133,14 @@ class GoldCorpus(object): | ||||||
|     def train_docs(self, nlp, gold_preproc=False, |     def train_docs(self, nlp, gold_preproc=False, | ||||||
|                    projectivize=False, max_length=None, |                    projectivize=False, max_length=None, | ||||||
|                    noise_level=0.0): |                    noise_level=0.0): | ||||||
|         train_tuples = list(self.train_tuples) |  | ||||||
|         if projectivize: |         if projectivize: | ||||||
|             train_tuples = nonproj.preprocess_training_data( |             train_tuples = nonproj.preprocess_training_data( | ||||||
|                 self.train_tuples, label_freq_cutoff=30) |                 self.train_tuples, label_freq_cutoff=30) | ||||||
|         random.shuffle(train_tuples) |         random.shuffle(self.train_locs) | ||||||
|         gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc, |         gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc, | ||||||
|                                         max_length=max_length, |                                         max_length=max_length, | ||||||
|                                         noise_level=noise_level) |                                         noise_level=noise_level) | ||||||
|         yield from gold_docs |         yield from itershuffle(gold_docs, bufsize=100) | ||||||
| 
 | 
 | ||||||
|     def dev_docs(self, nlp, gold_preproc=False): |     def dev_docs(self, nlp, gold_preproc=False): | ||||||
|         gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc) |         gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc) | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user