* Fix efficiency of JSON reading, by using ujson instead of streaming with ijson

Matthew Honnibal 2015-05-30 17:54:52 +02:00
parent 6bbdcc5db5
commit 9e39a206da

@@ -2,6 +2,7 @@ import numpy
 import codecs
 import json
 import ijson
+import ujson
 import random
 import re
 import os
@@ -96,12 +97,14 @@ def _min_edit_path(cand_words, gold_words):
 def read_json_file(loc):
     print loc
     if path.isdir(loc):
         for filename in os.listdir(loc):
             yield from read_json_file(path.join(loc, filename))
     else:
         with open(loc) as file_:
-            for doc in ijson.items(file_, 'item'):
+            docs = ujson.load(file_)
+            for doc in docs:
                 paragraphs = []
                 for paragraph in doc['paragraphs']:
                     sents = []
@@ -122,6 +125,7 @@ def read_json_file(loc):
                     sents.append((
                         (ids, words, tags, heads, labels, ner),
                         sent.get('brackets', [])))
-                yield (paragraph.get('raw', None), sents)
+                if sents:
+                    yield (paragraph.get('raw', None), sents)
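
For context, a minimal sketch (not code from this repository) of the two reading strategies the commit swaps. ijson walks the file incrementally as a stream of parse events, which keeps memory low but pays a per-event Python overhead; ujson decodes the whole file in a single C call. The function names and the assumed file layout (a top-level JSON array of documents) are illustrative.

    import ijson   # incremental, event-driven JSON parser
    import ujson   # C-backed parser that decodes the whole file at once


    def read_docs_streaming(loc):
        # Old approach: parse the top-level array one element at a time.
        # 'item' is ijson's prefix for each element of a top-level list.
        with open(loc) as file_:
            for doc in ijson.items(file_, 'item'):
                yield doc


    def read_docs_bulk(loc):
        # New approach: decode the entire file in one call, then iterate
        # the resulting Python list. Faster when the corpus fits in memory.
        with open(loc) as file_:
            for doc in ujson.load(file_):
                yield doc

Either generator yields the same sequence of document dicts; the trade-off is peak memory (streaming) versus raw decoding speed (bulk loading).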