* Fix efficiency of JSON reading, by loading each file in one pass with ujson instead of streaming it with ijson

This commit is contained in:
Matthew Honnibal 2015-05-30 17:54:52 +02:00
parent 6bbdcc5db5
commit 9e39a206da

View File

@ -2,6 +2,7 @@ import numpy
import codecs import codecs
import json import json
import ijson import ijson
import ujson
import random import random
import re import re
import os import os
@ -96,32 +97,35 @@ def _min_edit_path(cand_words, gold_words):
def read_json_file(loc):
    """Yield (raw_text, sentences) tuples from the JSON training data at *loc*.

    If *loc* is a directory, recurse into every file it contains; otherwise
    parse the file as a single JSON document with ujson (one in-memory pass,
    which is faster than streaming items with ijson).

    Each yielded tuple is ``(paragraph.get('raw'), sents)`` where ``sents`` is
    a list of ``((ids, words, tags, heads, labels, ner), brackets)`` entries,
    one per sentence. Paragraphs with no sentences are skipped.
    """
    if path.isdir(loc):
        # A corpus may be split across many files in one directory: recurse.
        for filename in os.listdir(loc):
            yield from read_json_file(path.join(loc, filename))
    else:
        # ujson.load consumes the whole file at once, so the handle can be
        # closed before we start iterating the parsed documents.
        with open(loc) as file_:
            docs = ujson.load(file_)
        for doc in docs:
            for paragraph in doc['paragraphs']:
                sents = []
                for sent in paragraph['sentences']:
                    ids = []
                    words = []
                    tags = []
                    heads = []
                    labels = []
                    ner = []
                    for i, token in enumerate(sent['tokens']):
                        ids.append(i)
                        words.append(token['orth'])
                        tags.append(token['tag'])
                        # 'head' is stored relative to the token's own index.
                        heads.append(token['head'] + i)
                        labels.append(token['dep'])
                        # NER tag is optional; '-' marks "no annotation".
                        ner.append(token.get('ner', '-'))
                    sents.append((
                        (ids, words, tags, heads, labels, ner),
                        sent.get('brackets', [])))
                if sents:
                    yield (paragraph.get('raw', None), sents)