* Fix efficiency of JSON reading, by using ujson instead of streaming with ijson

Matthew Honnibal 2015-05-30 17:54:52 +02:00
parent 6bbdcc5db5
commit 9e39a206da

@@ -2,6 +2,7 @@ import numpy
 import codecs
 import json
 import ijson
+import ujson
 import random
 import re
 import os
@@ -96,12 +97,14 @@ def _min_edit_path(cand_words, gold_words):
 def read_json_file(loc):
     print loc
     if path.isdir(loc):
         for filename in os.listdir(loc):
             yield from read_json_file(path.join(loc, filename))
     else:
         with open(loc) as file_:
-            for doc in ijson.items(file_, 'item'):
+            docs = ujson.load(file_)
+            for doc in docs:
                 paragraphs = []
                 for paragraph in doc['paragraphs']:
                     sents = []
@@ -122,6 +125,7 @@ def read_json_file(loc):
                     sents.append((
                         (ids, words, tags, heads, labels, ner),
                         sent.get('brackets', [])))
-                yield (paragraph.get('raw', None), sents)
+                if sents:
+                    yield (paragraph.get('raw', None), sents)
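
For context, a minimal sketch (not code from this repository) of the two reading strategies the commit swaps. ijson walks the file incrementally as a stream of parse events, which keeps memory low but pays a per-event Python overhead; ujson decodes the whole file in a single C call. The function names and the assumed file layout (a top-level JSON array of documents) are illustrative.

    import ijson   # incremental, event-driven JSON parser
    import ujson   # C-backed parser that decodes the whole file at once


    def read_docs_streaming(loc):
        # Old approach: parse the top-level array one element at a time.
        # 'item' is ijson's prefix for each element of a top-level list.
        with open(loc) as file_:
            for doc in ijson.items(file_, 'item'):
                yield doc


    def read_docs_bulk(loc):
        # New approach: decode the entire file in one call, then iterate
        # the resulting Python list. Faster when the corpus fits in memory.
        with open(loc) as file_:
            for doc in ujson.load(file_):
                yield doc

Either generator yields the same sequence of document dicts; the trade-off is peak memory (streaming) versus raw decoding speed (bulk loading).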