From 7a2725bca4131330e0941ccd808448d52c7f3f9f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 27 May 2015 19:13:11 +0200 Subject: [PATCH] * Read input json in a streaming way --- spacy/gold.pyx | 53 ++++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 78782eda4..0bc2d1f72 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -1,6 +1,7 @@ import numpy import codecs import json +import ijson import random import re @@ -38,11 +39,13 @@ def _min_edit_path(cand_words, gold_words): # TODO: Fix this --- just do it properly, make the full edit matrix and # then walk back over it... - mem = Pool() # Preprocess inputs cand_words = [punct_re.sub('', w) for w in cand_words] gold_words = [punct_re.sub('', w) for w in gold_words] - + + if cand_words == gold_words: + return 0, ['M' for _ in gold_words] + mem = Pool() n_cand = len(cand_words) n_gold = len(gold_words) # Levenshtein distance, except we need the history, and we may want different @@ -89,30 +92,30 @@ def _min_edit_path(cand_words, gold_words): return prev_costs[n_gold], previous_row[-1] -def read_json_file(loc): - paragraphs = [] - for doc in json.load(open(loc)): - for paragraph in doc['paragraphs']: - words = [] - ids = [] - tags = [] - heads = [] - labels = [] - ner = [] - for token in paragraph['tokens']: - words.append(token['orth']) - ids.append(token['id']) - tags.append(token['tag']) - heads.append(token['head'] if token['head'] >= 0 else token['id']) - labels.append(token['dep']) - ner.append(token.get('ner', '-')) - brackets = [] - paragraphs.append(( - paragraph['raw'], - (ids, words, tags, heads, labels, ner), - paragraph.get('brackets', []))) - return paragraphs +def read_json_file(loc): + with open(loc) as file_: + for doc in ijson.items(file_, 'item'): + paragraphs = [] + for paragraph in doc['paragraphs']: + words = [] + ids = [] + tags = [] + heads = [] + labels = [] + ner = [] + for token in paragraph['tokens']: + words.append(token['orth']) + ids.append(token['id']) + tags.append(token['tag']) + heads.append(token['head'] if token['head'] >= 0 else token['id']) + labels.append(token['dep']) + ner.append(token.get('ner', '-')) + + yield ( + paragraph.get('raw', None), + (ids, words, tags, heads, labels, ner), + paragraph.get('brackets', [])) def _iob_to_biluo(tags):