Mirror of https://github.com/explosion/spaCy.git
Synced 2025-10-30 15:37:29 +03:00
Support sentence limits in GoldCorpus

parent e2136232f9
commit c9760b2104
@@ -144,7 +144,7 @@ def _min_edit_path(cand_words, gold_words):
 class GoldCorpus(object):
     """An annotated corpus, using the JSON file format. Manages
     annotations for tagging, dependency parsing and NER."""
-    def __init__(self, train_path, dev_path):
+    def __init__(self, train_path, dev_path, limit=None):
         """Create a GoldCorpus.

         train_path (unicode or Path): File or directory of training data.
@@ -152,20 +152,31 @@ class GoldCorpus(object):
         """
         self.train_path = util.ensure_path(train_path)
         self.dev_path = util.ensure_path(dev_path)
+        self.limit = limit
         self.train_locs = self.walk_corpus(self.train_path)
         self.dev_locs = self.walk_corpus(self.dev_path)

     @property
     def train_tuples(self):
+        i = 0
         for loc in self.train_locs:
             gold_tuples = read_json_file(loc)
-            yield from gold_tuples
+            for item in gold_tuples:
+                yield item
+                i += 1
+                if self.limit and i >= self.limit:
+                    break

     @property
     def dev_tuples(self):
+        i = 0
         for loc in self.dev_locs:
             gold_tuples = read_json_file(loc)
-            yield from gold_tuples
+            for item in gold_tuples:
+                yield item
+                i += 1
+                if self.limit and i >= self.limit:
+                    break

     def count_train(self):
         n = 0
@@ -175,8 +186,7 @@ class GoldCorpus(object):

     def train_docs(self, nlp, shuffle=0, gold_preproc=True,
                    projectivize=False):
         if shuffle:
             random.shuffle(self.train_locs)
         train_tuples = self.train_tuples
         if projectivize:
             train_tuples = nonproj.preprocess_training_data(
                                self.train_tuples)
@@ -185,13 +195,13 @@ class GoldCorpus(object):
         gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc)
         yield from gold_docs

-    def dev_docs(self, nlp):
-        gold_docs = self.iter_gold_docs(nlp, self.dev_tuples)
+    def dev_docs(self, nlp, gold_preproc=True):
+        gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
         gold_docs = nlp.preprocess_gold(gold_docs)
         yield from gold_docs

     @classmethod
-    def iter_gold_docs(cls, nlp, tuples, gold_preproc=True):
+    def iter_gold_docs(cls, nlp, tuples, gold_preproc):
         for raw_text, paragraph_tuples in tuples:
             docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
                                   gold_preproc)
@@ -275,7 +285,7 @@ def read_json_file(loc, docs_filter=None, limit=None):
                         ner.append(token.get('ner', '-'))
                     sents.append([
                         [ids, words, tags, heads, labels, ner],
-                        sent.get('brackets', [])])
+                         sent.get('brackets', [])])
                 if sents:
                     yield [paragraph.get('raw', None), sents]

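For reference, a minimal usage sketch of the new limit option, assuming the compiled module exposes the class as spacy.gold.GoldCorpus; the file paths and the limit value below are hypothetical placeholders, not part of the commit:

# Hypothetical paths and limit value; GoldCorpus as defined in this commit.
from spacy.gold import GoldCorpus

corpus = GoldCorpus('train.json', 'dev.json', limit=100)

# With limit set, train_tuples and dev_tuples stop early once roughly
# `limit` annotated paragraphs have been yielded, instead of streaming
# every file in full.
for raw_text, paragraph_tuples in corpus.train_tuples:
    print(len(paragraph_tuples), 'sentence(s) in this paragraph')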