mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	* Improve efficiency of JSON reading by loading files with ujson instead of streaming them with ijson
This commit is contained in:
		
							parent
							
								
									6bbdcc5db5
								
							
						
					
					
						commit
						9e39a206da
					
				| 
						 | 
					@ -2,6 +2,7 @@ import numpy
 | 
				
			||||||
import codecs
 | 
					import codecs
 | 
				
			||||||
import json
 | 
					import json
 | 
				
			||||||
import ijson
 | 
					import ijson
 | 
				
			||||||
 | 
					import ujson
 | 
				
			||||||
import random
 | 
					import random
 | 
				
			||||||
import re
 | 
					import re
 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
| 
						 | 
					@ -96,12 +97,14 @@ def _min_edit_path(cand_words, gold_words):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def read_json_file(loc):
 | 
					def read_json_file(loc):
 | 
				
			||||||
 | 
					    print loc
 | 
				
			||||||
    if path.isdir(loc):
 | 
					    if path.isdir(loc):
 | 
				
			||||||
        for filename in os.listdir(loc):
 | 
					        for filename in os.listdir(loc):
 | 
				
			||||||
            yield from read_json_file(path.join(loc, filename))
 | 
					            yield from read_json_file(path.join(loc, filename))
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        with open(loc) as file_:
 | 
					        with open(loc) as file_:
 | 
				
			||||||
            for doc in ijson.items(file_, 'item'):
 | 
					            docs = ujson.load(file_)
 | 
				
			||||||
 | 
					        for doc in docs:
 | 
				
			||||||
            paragraphs = []
 | 
					            paragraphs = []
 | 
				
			||||||
            for paragraph in doc['paragraphs']:
 | 
					            for paragraph in doc['paragraphs']:
 | 
				
			||||||
                sents = []
 | 
					                sents = []
 | 
				
			||||||
| 
						 | 
					@ -122,6 +125,7 @@ def read_json_file(loc):
 | 
				
			||||||
                    sents.append((
 | 
					                    sents.append((
 | 
				
			||||||
                        (ids, words, tags, heads, labels, ner),
 | 
					                        (ids, words, tags, heads, labels, ner),
 | 
				
			||||||
                        sent.get('brackets', [])))
 | 
					                        sent.get('brackets', [])))
 | 
				
			||||||
 | 
					                if sents:
 | 
				
			||||||
                    yield (paragraph.get('raw', None), sents)
 | 
					                    yield (paragraph.get('raw', None), sents)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user