mirror of
https://github.com/explosion/spaCy.git
synced 2025-05-01 14:23:40 +03:00
* Fix efficiency of JSON reading by using ujson instead of streaming with ijson
This commit is contained in:
parent
6bbdcc5db5
commit
9e39a206da
|
@ -2,6 +2,7 @@ import numpy
|
||||||
import codecs
|
import codecs
|
||||||
import json
|
import json
|
||||||
import ijson
|
import ijson
|
||||||
|
import ujson
|
||||||
import random
|
import random
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
|
@ -96,12 +97,14 @@ def _min_edit_path(cand_words, gold_words):
|
||||||
|
|
||||||
|
|
||||||
def read_json_file(loc):
|
def read_json_file(loc):
|
||||||
|
print loc
|
||||||
if path.isdir(loc):
|
if path.isdir(loc):
|
||||||
for filename in os.listdir(loc):
|
for filename in os.listdir(loc):
|
||||||
yield from read_json_file(path.join(loc, filename))
|
yield from read_json_file(path.join(loc, filename))
|
||||||
else:
|
else:
|
||||||
with open(loc) as file_:
|
with open(loc) as file_:
|
||||||
for doc in ijson.items(file_, 'item'):
|
docs = ujson.load(file_)
|
||||||
|
for doc in docs:
|
||||||
paragraphs = []
|
paragraphs = []
|
||||||
for paragraph in doc['paragraphs']:
|
for paragraph in doc['paragraphs']:
|
||||||
sents = []
|
sents = []
|
||||||
|
@ -122,6 +125,7 @@ def read_json_file(loc):
|
||||||
sents.append((
|
sents.append((
|
||||||
(ids, words, tags, heads, labels, ner),
|
(ids, words, tags, heads, labels, ner),
|
||||||
sent.get('brackets', [])))
|
sent.get('brackets', [])))
|
||||||
|
if sents:
|
||||||
yield (paragraph.get('raw', None), sents)
|
yield (paragraph.get('raw', None), sents)
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user