* Fix efficiency of JSON reading by using ujson instead of streaming with ijson

This commit is contained in:
Matthew Honnibal 2015-05-30 17:54:52 +02:00
parent 6bbdcc5db5
commit 9e39a206da

View File

@@ -2,6 +2,7 @@ import numpy
 import codecs
 import json
 import ijson
+import ujson
 import random
 import re
 import os
@@ -96,12 +97,14 @@ def _min_edit_path(cand_words, gold_words):
 def read_json_file(loc):
+    print loc
     if path.isdir(loc):
         for filename in os.listdir(loc):
             yield from read_json_file(path.join(loc, filename))
     else:
         with open(loc) as file_:
-            for doc in ijson.items(file_, 'item'):
+            docs = ujson.load(file_)
+            for doc in docs:
                 paragraphs = []
                 for paragraph in doc['paragraphs']:
                     sents = []
@@ -122,6 +125,7 @@ def read_json_file(loc):
                 sents.append((
                     (ids, words, tags, heads, labels, ner),
                     sent.get('brackets', [])))
+                if sents:
                     yield (paragraph.get('raw', None), sents)