mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-24 20:51:30 +03:00 
			
		
		
		
	Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
		
						commit
						0a27afbf86
					
				|  | @ -35,21 +35,24 @@ def main(output_dir, model='en_core_web_sm', n_jobs=4, batch_size=1000, | |||
|     texts, _ = zip(*data[-limit:]) | ||||
|     print("Processing texts...") | ||||
|     partitions = partition_all(batch_size, texts) | ||||
|     items = ((i, [nlp(text) for text in texts], output_dir) for i, texts | ||||
|              in enumerate(partitions)) | ||||
|     Parallel(n_jobs=n_jobs)(delayed(transform_texts)(*item) for item in items) | ||||
|     executor = Parallel(n_jobs=n_jobs) | ||||
|     do = delayed(transform_texts) | ||||
|     tasks = (do(nlp, i, batch, output_dir) | ||||
|              for i, batch in enumerate(partitions)) | ||||
|     executor(tasks) | ||||
| 
 | ||||
| 
 | ||||
| def transform_texts(batch_id, docs, output_dir): | ||||
| def transform_texts(nlp, batch_id, texts, output_dir): | ||||
|     print(nlp.pipe_names) | ||||
|     out_path = Path(output_dir) / ('%d.txt' % batch_id) | ||||
|     if out_path.exists():  # return None in case same batch is called again | ||||
|         return None | ||||
|     print('Processing batch', batch_id) | ||||
|     with out_path.open('w', encoding='utf8') as f: | ||||
|         for doc in docs: | ||||
|         for doc in nlp.pipe(texts): | ||||
|             f.write(' '.join(represent_word(w) for w in doc if not w.is_space)) | ||||
|             f.write('\n') | ||||
|     print('Saved {} texts to {}.txt'.format(len(docs), batch_id)) | ||||
|     print('Saved {} texts to {}.txt'.format(len(texts), batch_id)) | ||||
| 
 | ||||
| 
 | ||||
| def represent_word(word): | ||||
|  |  | |||
|  | @ -135,10 +135,6 @@ class Language(object): | |||
|         self.pipeline = [] | ||||
|         self._optimizer = None | ||||
| 
 | ||||
|     def __reduce__(self): | ||||
|         bytes_data = self.to_bytes(vocab=False) | ||||
|         return (unpickle_language, (self.vocab, self.meta, bytes_data)) | ||||
| 
 | ||||
|     @property | ||||
|     def path(self): | ||||
|         return self._path | ||||
|  | @ -724,12 +720,6 @@ class DisabledPipes(list): | |||
|         self[:] = [] | ||||
| 
 | ||||
| 
 | ||||
| def unpickle_language(vocab, meta, bytes_data): | ||||
|     lang = Language(vocab=vocab) | ||||
|     lang.from_bytes(bytes_data) | ||||
|     return lang | ||||
| 
 | ||||
| 
 | ||||
| def _pipe(func, docs): | ||||
|     for doc in docs: | ||||
|         func(doc) | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user