* Read json files recursively from a directory, instead of requiring a single .json file

Matthew Honnibal 2015-05-29 03:52:55 +02:00
parent 8f31d3b864
commit b76bbbd12c
2 changed files with 29 additions and 23 deletions

@@ -138,8 +138,8 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
 @plac.annotations(
-    train_loc=("Location of training json file"),
-    dev_loc=("Location of development json file"),
+    train_loc=("Location of training file or directory"),
+    dev_loc=("Location of development file or directory"),
     corruption_level=("Amount of noise to add to training data", "option", "c", float),
     model_dir=("Location of output model directory",),
     out_loc=("Out location", "option", "o", str),

@@ -4,6 +4,8 @@ import json
 import ijson
 import random
 import re
+import os
+from os import path
 from spacy.munge.read_ner import tags_to_entities
 from libc.string cimport memset
@@ -94,28 +96,32 @@ def _min_edit_path(cand_words, gold_words):
 def read_json_file(loc):
-    with open(loc) as file_:
-        for doc in ijson.items(file_, 'item'):
-            paragraphs = []
-            for paragraph in doc['paragraphs']:
-                words = []
-                ids = []
-                tags = []
-                heads = []
-                labels = []
-                ner = []
-                for token in paragraph['tokens']:
-                    words.append(token['orth'])
-                    ids.append(token['id'])
-                    tags.append(token['tag'])
-                    heads.append(token['head'] if token['head'] >= 0 else token['id'])
-                    labels.append(token['dep'])
-                    ner.append(token.get('ner', '-'))
-                yield (
-                    paragraph.get('raw', None),
-                    (ids, words, tags, heads, labels, ner),
-                    paragraph.get('brackets', []))
+    if path.isdir(loc):
+        for filename in os.listdir(loc):
+            yield from read_json_file(path.join(loc, filename))
+    else:
+        with open(loc) as file_:
+            for doc in ijson.items(file_, 'item'):
+                paragraphs = []
+                for paragraph in doc['paragraphs']:
+                    words = []
+                    ids = []
+                    tags = []
+                    heads = []
+                    labels = []
+                    ner = []
+                    for token in paragraph['tokens']:
+                        words.append(token['orth'])
+                        ids.append(token['id'])
+                        tags.append(token['tag'])
+                        heads.append(token['head'] if token['head'] >= 0 else token['id'])
+                        labels.append(token['dep'])
+                        ner.append(token.get('ner', '-'))
+                    yield (
+                        paragraph.get('raw', None),
+                        (ids, words, tags, heads, labels, ner),
+                        paragraph.get('brackets', []))

 def _iob_to_biluo(tags):
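With this hunk, read_json_file accepts either a single .json file or a directory and recurses through directory entries with yield from. A dependency-free sketch of the same pattern, assuming plain JSON files that each hold a top-level list of documents (the function and path names are illustrative; the real function streams documents with ijson and yields one annotation tuple per paragraph):

import json
import os
from os import path

def iter_json_docs(loc):
    # Recurse if loc is a directory; otherwise treat it as a single JSON
    # file containing a list of documents.
    if path.isdir(loc):
        for filename in os.listdir(loc):
            yield from iter_json_docs(path.join(loc, filename))
    else:
        with open(loc) as file_:
            for doc in json.load(file_):
                yield doc

# Usage with a hypothetical corpus layout: files in nested
# subdirectories of corpus/train are picked up as well.
# for doc in iter_json_docs('corpus/train'):
#     paragraphs = doc['paragraphs']

As in the patched function, directory entries are not filtered by extension, so any non-JSON file placed in the tree would raise a parse error.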