* Read json files recursively from a directory, instead of requiring a single .json file

Matthew Honnibal 2015-05-29 03:52:55 +02:00
parent 8f31d3b864
commit b76bbbd12c
2 changed files with 29 additions and 23 deletions

@@ -138,8 +138,8 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
 @plac.annotations(
-    train_loc=("Location of training json file"),
-    dev_loc=("Location of development json file"),
+    train_loc=("Location of training file or directory"),
+    dev_loc=("Location of development file or directory"),
     corruption_level=("Amount of noise to add to training data", "option", "c", float),
     model_dir=("Location of output model directory",),
     out_loc=("Out location", "option", "o", str),

@@ -4,6 +4,8 @@ import json
 import ijson
 import random
 import re
+import os
+from os import path
 from spacy.munge.read_ner import tags_to_entities
 from libc.string cimport memset
@@ -94,28 +96,32 @@ def _min_edit_path(cand_words, gold_words):
 def read_json_file(loc):
-    with open(loc) as file_:
-        for doc in ijson.items(file_, 'item'):
-            paragraphs = []
-            for paragraph in doc['paragraphs']:
-                words = []
-                ids = []
-                tags = []
-                heads = []
-                labels = []
-                ner = []
-                for token in paragraph['tokens']:
-                    words.append(token['orth'])
-                    ids.append(token['id'])
-                    tags.append(token['tag'])
-                    heads.append(token['head'] if token['head'] >= 0 else token['id'])
-                    labels.append(token['dep'])
-                    ner.append(token.get('ner', '-'))
-                yield (
-                    paragraph.get('raw', None),
-                    (ids, words, tags, heads, labels, ner),
-                    paragraph.get('brackets', []))
+    if path.isdir(loc):
+        for filename in os.listdir(loc):
+            yield from read_json_file(path.join(loc, filename))
+    else:
+        with open(loc) as file_:
+            for doc in ijson.items(file_, 'item'):
+                paragraphs = []
+                for paragraph in doc['paragraphs']:
+                    words = []
+                    ids = []
+                    tags = []
+                    heads = []
+                    labels = []
+                    ner = []
+                    for token in paragraph['tokens']:
+                        words.append(token['orth'])
+                        ids.append(token['id'])
+                        tags.append(token['tag'])
+                        heads.append(token['head'] if token['head'] >= 0 else token['id'])
+                        labels.append(token['dep'])
+                        ner.append(token.get('ner', '-'))
+                    yield (
+                        paragraph.get('raw', None),
+                        (ids, words, tags, heads, labels, ner),
+                        paragraph.get('brackets', []))

 def _iob_to_biluo(tags):
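With this hunk, read_json_file accepts either a single .json file or a directory and recurses through directory entries with yield from. A dependency-free sketch of the same pattern, assuming plain JSON files that each hold a top-level list of documents (the function and path names are illustrative; the real function streams documents with ijson and yields one annotation tuple per paragraph):

import json
import os
from os import path

def iter_json_docs(loc):
    # Recurse if loc is a directory; otherwise treat it as a single JSON
    # file containing a list of documents.
    if path.isdir(loc):
        for filename in os.listdir(loc):
            yield from iter_json_docs(path.join(loc, filename))
    else:
        with open(loc) as file_:
            for doc in json.load(file_):
                yield doc

# Usage with a hypothetical corpus layout: files in nested
# subdirectories of corpus/train are picked up as well.
# for doc in iter_json_docs('corpus/train'):
#     paragraphs = doc['paragraphs']

As in the patched function, directory entries are not filtered by extension, so any non-JSON file placed in the tree would raise a parse error.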