Add converter for jsonl NER data

2025-10-28 06:31:12 +03:00 · 2018-08-14 14:04:32 +02:00 · 2018-08-14 14:04:32 +02:00 · 6ea981c839
commit 6ea981c839
parent a9fb6d5511
7 changed files with 44 additions and 5 deletions
--- a/spacy/cli/_messages.py
+++ b/spacy/cli/_messages.py
@ -72,3 +72,4 @@ class Messages(object):
    M051 = ("Development data not found")
    M052 = ("Not a valid meta.json format")
    M053 = ("Expected dict but got: {meta_type}")
+    M054 = ("No --lang specified, but tokenization required.")
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@ -5,6 +5,7 @@ import plac
 from pathlib import Path

 from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json
+from .converters import ner_jsonl2json
 from ._messages import Messages
 from ..util import prints

@ -17,6 +18,7 @@ CONVERTERS = {
    'conll': conllu2json,
    'ner': conll_ner2json,
    'iob': iob2json,
+    'jsonl': ner_jsonl2json
 }


@ -25,8 +27,10 @@ CONVERTERS = {
    output_dir=("output directory for converted file", "positional", None, str),
    n_sents=("Number of sentences per doc", "option", "n", int),
    converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
+    lang=("Language (if tokenizer required)", "option", "l", str),
    morphology=("Enable appending morphology to tags", "flag", "m", bool))
-def convert(input_file, output_dir, n_sents=1, morphology=False, converter='auto'):
+def convert(input_file, output_dir, n_sents=1, morphology=False, converter='auto',
+        lang=None):
    """
    Convert files into JSON format for use with train command and other
    experiment management functions.
@ -44,4 +48,4 @@ def convert(input_file, output_dir, n_sents=1, morphology=False, converter='auto
                   title=Messages.M030, exits=1)
    func = CONVERTERS[converter]
    func(input_path, output_path,
-         n_sents=n_sents, use_morphology=morphology)
+         n_sents=n_sents, use_morphology=morphology, lang=lang)
--- a/spacy/cli/converters/__init__.py
+++ b/spacy/cli/converters/__init__.py
@ -2,3 +2,4 @@ from .conllu2json import conllu2json
 from .conllubio2json import conllubio2json
 from .iob2json import iob2json
 from .conll_ner2json import conll_ner2json
+from .jsonl2json import ner_jsonl2json
--- a/spacy/cli/converters/conll_ner2json.py
+++ b/spacy/cli/converters/conll_ner2json.py
@ -7,7 +7,7 @@ from ...util import prints
 from ...gold import iob_to_biluo


-def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
+def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
    """
    Convert files in the CoNLL-2003 NER format into JSON format for use with
    train cli.
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@ -8,7 +8,7 @@ from ...gold import iob_to_biluo
 import re


-def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
+def conllu2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):

    """
    Convert conllu files into JSON format for use with train cli.
--- a/spacy/cli/converters/conllubio2json.py
+++ b/spacy/cli/converters/conllubio2json.py
@ -5,7 +5,7 @@ from ...compat import json_dumps, path2str
 from ...util import prints
 from ...gold import iob_to_biluo

-def conllubio2json(input_path, output_path, n_sents=10, use_morphology=False):
+def conllubio2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
    """
    Convert conllu files into JSON format for use with train cli.
    use_morphology parameter enables appending morphology to tags, which is
--- a/spacy/cli/converters/jsonl2json.py
+++ b/spacy/cli/converters/jsonl2json.py
@ -0,0 +1,33 @@
+# coding: utf8
+from __future__ import unicode_literals
+import ujson as json
+
+from .._messages import Messages
+from ...compat import json_dumps, path2str
+from ...util import prints, get_lang_class
+from ...gold import docs_to_json
+
+
+def ner_jsonl2json(input_path, output_path, lang=None, n_sents=10, use_morphology=False):
+    """
+    Convert NER annotations in JSONL format into JSON format for use with
+    train cli.
+
+    Each record yielded by read_jsonl is unpacked as a (raw_text, annotations)
+    pair, where annotations['entities'] holds (start_char, end_char, label)
+    triples. The text is tokenized with the language class for `lang`.
+
+    input_path (Path): The .jsonl file to convert.
+    output_path (Path): Directory the converted .json file is written to.
+    lang (unicode): Language used for tokenization; exits if None.
+    n_sents (int): Unused here; part of the shared converter signature.
+    use_morphology (bool): Unused here; part of the shared converter signature.
+    """
+    if lang is None:
+        prints(Messages.M054, exits=1)
+    nlp = get_lang_class(lang)()
+    json_docs = []
+    for raw_text, annotations in read_jsonl(input_path):
+        doc = nlp.make_doc(raw_text)
+        if len(doc) == 0:
+            # No tokens (e.g. empty text): doc[0] would raise IndexError.
+            continue
+        doc[0].is_sent_start = True
+        spans = [doc.char_span(start, end, label=label)
+                 for start, end, label in annotations['entities']]
+        # char_span returns None for offsets that don't align with token
+        # boundaries; such entries can't be assigned to doc.ents.
+        doc.ents = [span for span in spans if span is not None]
+        # Number docs by output position so ids stay contiguous after skips.
+        json_docs.append(docs_to_json(len(json_docs), [doc]))
+
+    output_filename = input_path.parts[-1].replace(".jsonl", ".json")
+    output_loc = output_path / output_filename
+    with (output_loc).open('w', encoding='utf8') as file_:
+        file_.write(json_dumps(json_docs))
+    prints(Messages.M033.format(n_docs=len(json_docs)),
+           title=Messages.M032.format(name=path2str(output_loc)))
+
+def read_jsonl(input_path):
+    """
+    Yield one parsed JSON value per non-blank line of a JSONL file.
+
+    input_path (Path): File to read, opened as UTF-8 text.
+    YIELDS: The object decoded from each line by json.loads.
+    """
+    with input_path.open('r', encoding='utf8') as file_:
+        for line in file_:
+            # Blank lines (e.g. extra newlines at EOF) are not valid JSON.
+            if not line.strip():
+                continue
+            yield json.loads(line)