mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	* Fix efficiency of JSON reading by loading the whole file with ujson instead of streaming it with ijson
This commit is contained in:
		
							parent
							
								
									6bbdcc5db5
								
							
						
					
					
						commit
						9e39a206da
					
				| 
						 | 
					@ -2,6 +2,7 @@ import numpy
 | 
				
			||||||
import codecs
 | 
					import codecs
 | 
				
			||||||
import json
 | 
					import json
 | 
				
			||||||
import ijson
 | 
					import ijson
 | 
				
			||||||
 | 
					import ujson
 | 
				
			||||||
import random
 | 
					import random
 | 
				
			||||||
import re
 | 
					import re
 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
| 
						 | 
					@ -96,32 +97,35 @@ def _min_edit_path(cand_words, gold_words):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def read_json_file(loc):
 | 
					def read_json_file(loc):
 | 
				
			||||||
 | 
					    print loc
 | 
				
			||||||
    if path.isdir(loc):
 | 
					    if path.isdir(loc):
 | 
				
			||||||
        for filename in os.listdir(loc):
 | 
					        for filename in os.listdir(loc):
 | 
				
			||||||
            yield from read_json_file(path.join(loc, filename))
 | 
					            yield from read_json_file(path.join(loc, filename))
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        with open(loc) as file_:
 | 
					        with open(loc) as file_:
 | 
				
			||||||
            for doc in ijson.items(file_, 'item'):
 | 
					            docs = ujson.load(file_)
 | 
				
			||||||
                paragraphs = []
 | 
					        for doc in docs:
 | 
				
			||||||
                for paragraph in doc['paragraphs']:
 | 
					            paragraphs = []
 | 
				
			||||||
                    sents = []
 | 
					            for paragraph in doc['paragraphs']:
 | 
				
			||||||
                    for sent in paragraph['sentences']:
 | 
					                sents = []
 | 
				
			||||||
                        words = []
 | 
					                for sent in paragraph['sentences']:
 | 
				
			||||||
                        ids = []
 | 
					                    words = []
 | 
				
			||||||
                        tags = []
 | 
					                    ids = []
 | 
				
			||||||
                        heads = []
 | 
					                    tags = []
 | 
				
			||||||
                        labels = []
 | 
					                    heads = []
 | 
				
			||||||
                        ner = []
 | 
					                    labels = []
 | 
				
			||||||
                        for i, token in enumerate(sent['tokens']):
 | 
					                    ner = []
 | 
				
			||||||
                            words.append(token['orth'])
 | 
					                    for i, token in enumerate(sent['tokens']):
 | 
				
			||||||
                            ids.append(i)
 | 
					                        words.append(token['orth'])
 | 
				
			||||||
                            tags.append(token['tag'])
 | 
					                        ids.append(i)
 | 
				
			||||||
                            heads.append(token['head'] + i)
 | 
					                        tags.append(token['tag'])
 | 
				
			||||||
                            labels.append(token['dep'])
 | 
					                        heads.append(token['head'] + i)
 | 
				
			||||||
                            ner.append(token.get('ner', '-'))
 | 
					                        labels.append(token['dep'])
 | 
				
			||||||
                        sents.append((
 | 
					                        ner.append(token.get('ner', '-'))
 | 
				
			||||||
                            (ids, words, tags, heads, labels, ner),
 | 
					                    sents.append((
 | 
				
			||||||
                            sent.get('brackets', [])))
 | 
					                        (ids, words, tags, heads, labels, ner),
 | 
				
			||||||
 | 
					                        sent.get('brackets', [])))
 | 
				
			||||||
 | 
					                if sents:
 | 
				
			||||||
                    yield (paragraph.get('raw', None), sents)
 | 
					                    yield (paragraph.get('raw', None), sents)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user