From f942903429b33b920c18ed7f9c4fe4715733d55f Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 2 Oct 2017 17:02:10 +0200
Subject: [PATCH] Improve sentence merging in iob2json

---
 spacy/cli/converters/iob2json.py | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py
index 4d456fa57..74bc22ada 100644
--- a/spacy/cli/converters/iob2json.py
+++ b/spacy/cli/converters/iob2json.py
@@ -12,17 +12,13 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k):
     Convert IOB files into JSON format for use with train cli.
     """
     with input_path.open('r', encoding='utf8') as file_:
-        if n_sents:
-            lines = [' '.join(para) for para in partition_all(n_sents, file_)]
-        else:
-            lines = file_
-        sentences = read_iob(lines)
-
+        sentences = read_iob(file_)
+    docs = merge_sentences(sentences, n_sents)
     output_filename = input_path.parts[-1].replace(".iob", ".json")
     output_file = output_path / output_filename
     with output_file.open('w', encoding='utf-8') as f:
-        f.write(json_dumps(sentences))
-    prints("Created %d documents" % len(sentences),
+        f.write(json_dumps(docs))
+    prints("Created %d documents" % len(docs),
            title="Generated output file %s" % path2str(output_file))
 
 
@@ -46,3 +42,26 @@ def read_iob(raw_sents):
     paragraphs = [{'sentences': [sent]} for sent in sentences]
     docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs]
     return docs
+
+
+def merge_sentences(docs, n_sents):
+    """Merge each run of n_sents consecutive docs into a single doc.
+
+    docs (list): Docs as built by read_iob (one paragraph per doc).
+    n_sents (int): Docs per merged group; a falsy value disables merging.
+    RETURNS (list): The first doc of every group, extended in place.
+    """
+    if not n_sents:
+        # The code this replaces treated a falsy n_sents as "do not merge".
+        return docs
+    merged = []
+    for group in partition_all(n_sents, docs):
+        group = list(group)
+        first = group.pop(0)
+        to_extend = first['paragraphs'][0]['sentences']
+        # `first` is already popped, so every remaining doc gets merged;
+        # slicing with [1:] here would silently drop one doc per group.
+        for sent in group:
+            to_extend.extend(sent['paragraphs'][0]['sentences'])
+        merged.append(first)
+    return merged