From 6ea981c83904bd082d4a37d75dd02fe86b615b12 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 14 Aug 2018 14:04:32 +0200 Subject: [PATCH] Add converter for jsonl NER data --- spacy/cli/_messages.py | 1 + spacy/cli/convert.py | 8 +++++-- spacy/cli/converters/__init__.py | 1 + spacy/cli/converters/conll_ner2json.py | 2 +- spacy/cli/converters/conllu2json.py | 2 +- spacy/cli/converters/conllubio2json.py | 2 +- spacy/cli/converters/jsonl2json.py | 33 ++++++++++++++++++++++++++ 7 files changed, 44 insertions(+), 5 deletions(-) create mode 100644 spacy/cli/converters/jsonl2json.py diff --git a/spacy/cli/_messages.py b/spacy/cli/_messages.py index 88dcb1b35..01ec9dbf6 100644 --- a/spacy/cli/_messages.py +++ b/spacy/cli/_messages.py @@ -72,3 +72,4 @@ class Messages(object): M051 = ("Development data not found") M052 = ("Not a valid meta.json format") M053 = ("Expected dict but got: {meta_type}") + M054 = ("No --lang specified, but tokenization required.") diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 1b6217a63..cfbb9e56a 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -5,6 +5,7 @@ import plac from pathlib import Path from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json +from .converters import ner_jsonl2json from ._messages import Messages from ..util import prints @@ -17,6 +18,7 @@ CONVERTERS = { 'conll': conllu2json, 'ner': conll_ner2json, 'iob': iob2json, + 'jsonl': ner_jsonl2json } @@ -25,8 +27,10 @@ CONVERTERS = { output_dir=("output directory for converted file", "positional", None, str), n_sents=("Number of sentences per doc", "option", "n", int), converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str), + lang=("Language (if tokenizer required)", "option", "l", str), morphology=("Enable appending morphology to tags", "flag", "m", bool)) -def convert(input_file, output_dir, n_sents=1, morphology=False, converter='auto'): +def convert(input_file, output_dir, n_sents=1, 
morphology=False, converter='auto', + lang=None): """ Convert files into JSON format for use with train command and other experiment management functions. @@ -44,4 +48,4 @@ def convert(input_file, output_dir, n_sents=1, morphology=False, converter='auto title=Messages.M030, exits=1) func = CONVERTERS[converter] func(input_path, output_path, - n_sents=n_sents, use_morphology=morphology) + n_sents=n_sents, use_morphology=morphology, lang=lang) diff --git a/spacy/cli/converters/__init__.py b/spacy/cli/converters/__init__.py index c7d5ac198..c6898fa98 100644 --- a/spacy/cli/converters/__init__.py +++ b/spacy/cli/converters/__init__.py @@ -2,3 +2,4 @@ from .conllu2json import conllu2json from .conllubio2json import conllubio2json from .iob2json import iob2json from .conll_ner2json import conll_ner2json +from .jsonl2json import ner_jsonl2json diff --git a/spacy/cli/converters/conll_ner2json.py b/spacy/cli/converters/conll_ner2json.py index dd4f4d394..f53261488 100644 --- a/spacy/cli/converters/conll_ner2json.py +++ b/spacy/cli/converters/conll_ner2json.py @@ -7,7 +7,7 @@ from ...util import prints from ...gold import iob_to_biluo -def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False): +def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None): """ Convert files in the CoNLL-2003 NER format into JSON format for use with train cli. diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index d4f73cff2..e26020ad2 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -8,7 +8,7 @@ from ...gold import iob_to_biluo import re -def conllu2json(input_path, output_path, n_sents=10, use_morphology=False): +def conllu2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None): """ Convert conllu files into JSON format for use with train cli. 
diff --git a/spacy/cli/converters/conllubio2json.py b/spacy/cli/converters/conllubio2json.py
index c671e2fed..881b8c533 100644
--- a/spacy/cli/converters/conllubio2json.py
+++ b/spacy/cli/converters/conllubio2json.py
@@ -5,7 +5,7 @@ from ...compat import json_dumps, path2str
 from ...util import prints
 from ...gold import iob_to_biluo
 
-def conllubio2json(input_path, output_path, n_sents=10, use_morphology=False):
+def conllubio2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
     """
     Convert conllu files into JSON format for use with train cli.
     use_morphology parameter enables appending morphology to tags, which is
diff --git a/spacy/cli/converters/jsonl2json.py b/spacy/cli/converters/jsonl2json.py
new file mode 100644
index 000000000..3508a05cd
--- /dev/null
+++ b/spacy/cli/converters/jsonl2json.py
@@ -0,0 +1,53 @@
+# coding: utf8
+from __future__ import unicode_literals
+import ujson as json
+
+from .._messages import Messages
+from ...compat import json_dumps, path2str
+from ...util import prints, get_lang_class
+from ...gold import docs_to_json
+
+
+def ner_jsonl2json(input_path, output_path, lang=None, n_sents=10, use_morphology=False):
+    """
+    Convert NER annotations in JSONL format into JSON format for use with
+    train cli. Each input line must be a JSON array of the form
+    [text, {"entities": [[start, end, label], ...]}], where start and end are
+    character offsets into text. The text is tokenized with the language class
+    for lang, so lang is required. n_sents and use_morphology are accepted so
+    the signature matches the other converters; they are not used here.
+    """
+    if lang is None:
+        prints(Messages.M054, exits=1)
+    nlp = get_lang_class(lang)()
+    json_docs = []
+    for i, (raw_text, annots) in enumerate(read_jsonl(input_path)):
+        doc = nlp.make_doc(raw_text)
+        if len(doc) == 0:
+            # Nothing to annotate, and doc[0] below would raise IndexError.
+            continue
+        doc[0].is_sent_start = True
+        spans = [doc.char_span(start, end, label=label)
+                 for start, end, label in annots['entities']]
+        # char_span returns None when the offsets don't line up with token
+        # boundaries; None can't be assigned to doc.ents, so drop those.
+        doc.ents = [span for span in spans if span is not None]
+        json_docs.append(docs_to_json(i, [doc]))
+
+    # Swap only the final suffix, so the output name always ends in .json.
+    output_loc = output_path / input_path.with_suffix('.json').name
+    with output_loc.open('w', encoding='utf8') as file_:
+        file_.write(json_dumps(json_docs))
+    prints(Messages.M033.format(n_docs=len(json_docs)),
+           title=Messages.M032.format(name=path2str(output_loc)))
+
+
+def read_jsonl(input_path):
+    """
+    Yield the parsed JSON value of each non-blank line of input_path.
+    """
+    with input_path.open('r', encoding='utf8') as file_:
+        for line in file_:
+            line = line.strip()
+            if line:
+                yield json.loads(line)