mirror of
https://github.com/explosion/spaCy.git
synced 2025-10-24 20:51:30 +03:00
* Read input json in a streaming way
This commit is contained in:
parent
b7fd77779a
commit
7a2725bca4
|
@ -1,6 +1,7 @@
|
||||||
import numpy
|
import numpy
|
||||||
import codecs
|
import codecs
|
||||||
import json
|
import json
|
||||||
|
import ijson
|
||||||
import random
|
import random
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
@ -38,11 +39,13 @@ def _min_edit_path(cand_words, gold_words):
|
||||||
|
|
||||||
# TODO: Fix this --- just do it properly, make the full edit matrix and
|
# TODO: Fix this --- just do it properly, make the full edit matrix and
|
||||||
# then walk back over it...
|
# then walk back over it...
|
||||||
mem = Pool()
|
|
||||||
# Preprocess inputs
|
# Preprocess inputs
|
||||||
cand_words = [punct_re.sub('', w) for w in cand_words]
|
cand_words = [punct_re.sub('', w) for w in cand_words]
|
||||||
gold_words = [punct_re.sub('', w) for w in gold_words]
|
gold_words = [punct_re.sub('', w) for w in gold_words]
|
||||||
|
|
||||||
|
if cand_words == gold_words:
|
||||||
|
return 0, ['M' for _ in gold_words]
|
||||||
|
mem = Pool()
|
||||||
n_cand = len(cand_words)
|
n_cand = len(cand_words)
|
||||||
n_gold = len(gold_words)
|
n_gold = len(gold_words)
|
||||||
# Levenshtein distance, except we need the history, and we may want different
|
# Levenshtein distance, except we need the history, and we may want different
|
||||||
|
@ -89,30 +92,30 @@ def _min_edit_path(cand_words, gold_words):
|
||||||
|
|
||||||
return prev_costs[n_gold], previous_row[-1]
|
return prev_costs[n_gold], previous_row[-1]
|
||||||
|
|
||||||
def read_json_file(loc):
|
|
||||||
paragraphs = []
|
|
||||||
for doc in json.load(open(loc)):
|
|
||||||
for paragraph in doc['paragraphs']:
|
|
||||||
words = []
|
|
||||||
ids = []
|
|
||||||
tags = []
|
|
||||||
heads = []
|
|
||||||
labels = []
|
|
||||||
ner = []
|
|
||||||
for token in paragraph['tokens']:
|
|
||||||
words.append(token['orth'])
|
|
||||||
ids.append(token['id'])
|
|
||||||
tags.append(token['tag'])
|
|
||||||
heads.append(token['head'] if token['head'] >= 0 else token['id'])
|
|
||||||
labels.append(token['dep'])
|
|
||||||
ner.append(token.get('ner', '-'))
|
|
||||||
|
|
||||||
brackets = []
|
def read_json_file(loc):
|
||||||
paragraphs.append((
|
with open(loc) as file_:
|
||||||
paragraph['raw'],
|
for doc in ijson.items(file_, 'item'):
|
||||||
(ids, words, tags, heads, labels, ner),
|
paragraphs = []
|
||||||
paragraph.get('brackets', [])))
|
for paragraph in doc['paragraphs']:
|
||||||
return paragraphs
|
words = []
|
||||||
|
ids = []
|
||||||
|
tags = []
|
||||||
|
heads = []
|
||||||
|
labels = []
|
||||||
|
ner = []
|
||||||
|
for token in paragraph['tokens']:
|
||||||
|
words.append(token['orth'])
|
||||||
|
ids.append(token['id'])
|
||||||
|
tags.append(token['tag'])
|
||||||
|
heads.append(token['head'] if token['head'] >= 0 else token['id'])
|
||||||
|
labels.append(token['dep'])
|
||||||
|
ner.append(token.get('ner', '-'))
|
||||||
|
|
||||||
|
yield (
|
||||||
|
paragraph.get('raw', None),
|
||||||
|
(ids, words, tags, heads, labels, ner),
|
||||||
|
paragraph.get('brackets', []))
|
||||||
|
|
||||||
|
|
||||||
def _iob_to_biluo(tags):
|
def _iob_to_biluo(tags):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user