mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	Always shuffle gold data, and support length cap
This commit is contained in:
		
							parent
							
								
									d65f99a720
								
							
						
					
					
						commit
						daac3e3573
					
				|  | @ -198,15 +198,15 @@ class GoldCorpus(object): | |||
|             n += 1 | ||||
|         return n | ||||
| 
 | ||||
|     def train_docs(self, nlp, shuffle=0, gold_preproc=False, | ||||
|                    projectivize=False): | ||||
|     def train_docs(self, nlp, gold_preproc=False, | ||||
|                    projectivize=False, max_length=None): | ||||
|         train_tuples = self.train_tuples | ||||
|         if projectivize: | ||||
|             train_tuples = nonproj.preprocess_training_data( | ||||
|                                self.train_tuples) | ||||
|         if shuffle: | ||||
|             random.shuffle(train_tuples) | ||||
|         gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc) | ||||
|         random.shuffle(train_tuples) | ||||
|         gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc, | ||||
|                                         max_length=max_length) | ||||
|         yield from gold_docs | ||||
| 
 | ||||
|     def dev_docs(self, nlp, gold_preproc=False): | ||||
|  | @ -215,7 +215,7 @@ class GoldCorpus(object): | |||
|         yield from gold_docs | ||||
| 
 | ||||
|     @classmethod | ||||
|     def iter_gold_docs(cls, nlp, tuples, gold_preproc): | ||||
|     def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None): | ||||
|         for raw_text, paragraph_tuples in tuples: | ||||
|             if gold_preproc: | ||||
|                 raw_text = None | ||||
|  | @ -226,7 +226,8 @@ class GoldCorpus(object): | |||
|                                   gold_preproc) | ||||
|             golds = cls._make_golds(docs, paragraph_tuples) | ||||
|             for doc, gold in zip(docs, golds): | ||||
|                 yield doc, gold | ||||
|                 if not max_length or len(doc) < max_length: | ||||
|                     yield doc, gold | ||||
| 
 | ||||
|     @classmethod | ||||
|     def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc): | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user