mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 05:37:03 +03:00
Add converter for jsonl NER data
This commit is contained in:
parent
a9fb6d5511
commit
6ea981c839
|
@ -72,3 +72,4 @@ class Messages(object):
|
||||||
M051 = ("Development data not found")
|
M051 = ("Development data not found")
|
||||||
M052 = ("Not a valid meta.json format")
|
M052 = ("Not a valid meta.json format")
|
||||||
M053 = ("Expected dict but got: {meta_type}")
|
M053 = ("Expected dict but got: {meta_type}")
|
||||||
|
M054 = ("No --lang specified, but tokenization required.")
|
||||||
|
|
|
@ -5,6 +5,7 @@ import plac
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json
|
from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json
|
||||||
|
from .converters import ner_jsonl2json
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
from ..util import prints
|
from ..util import prints
|
||||||
|
|
||||||
|
@ -17,6 +18,7 @@ CONVERTERS = {
|
||||||
'conll': conllu2json,
|
'conll': conllu2json,
|
||||||
'ner': conll_ner2json,
|
'ner': conll_ner2json,
|
||||||
'iob': iob2json,
|
'iob': iob2json,
|
||||||
|
'jsonl': ner_jsonl2json
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -25,8 +27,10 @@ CONVERTERS = {
|
||||||
output_dir=("output directory for converted file", "positional", None, str),
|
output_dir=("output directory for converted file", "positional", None, str),
|
||||||
n_sents=("Number of sentences per doc", "option", "n", int),
|
n_sents=("Number of sentences per doc", "option", "n", int),
|
||||||
converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
|
converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
|
||||||
|
lang=("Language (if tokenizer required)", "option", "l", str),
|
||||||
morphology=("Enable appending morphology to tags", "flag", "m", bool))
|
morphology=("Enable appending morphology to tags", "flag", "m", bool))
|
||||||
def convert(input_file, output_dir, n_sents=1, morphology=False, converter='auto'):
|
def convert(input_file, output_dir, n_sents=1, morphology=False, converter='auto',
|
||||||
|
lang=None):
|
||||||
"""
|
"""
|
||||||
Convert files into JSON format for use with train command and other
|
Convert files into JSON format for use with train command and other
|
||||||
experiment management functions.
|
experiment management functions.
|
||||||
|
@ -44,4 +48,4 @@ def convert(input_file, output_dir, n_sents=1, morphology=False, converter='auto
|
||||||
title=Messages.M030, exits=1)
|
title=Messages.M030, exits=1)
|
||||||
func = CONVERTERS[converter]
|
func = CONVERTERS[converter]
|
||||||
func(input_path, output_path,
|
func(input_path, output_path,
|
||||||
n_sents=n_sents, use_morphology=morphology)
|
n_sents=n_sents, use_morphology=morphology, lang=lang)
|
||||||
|
|
|
@ -2,3 +2,4 @@ from .conllu2json import conllu2json
|
||||||
from .conllubio2json import conllubio2json
|
from .conllubio2json import conllubio2json
|
||||||
from .iob2json import iob2json
|
from .iob2json import iob2json
|
||||||
from .conll_ner2json import conll_ner2json
|
from .conll_ner2json import conll_ner2json
|
||||||
|
from .jsonl2json import ner_jsonl2json
|
||||||
|
|
|
@ -7,7 +7,7 @@ from ...util import prints
|
||||||
from ...gold import iob_to_biluo
|
from ...gold import iob_to_biluo
|
||||||
|
|
||||||
|
|
||||||
def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
|
def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
|
||||||
"""
|
"""
|
||||||
Convert files in the CoNLL-2003 NER format into JSON format for use with
|
Convert files in the CoNLL-2003 NER format into JSON format for use with
|
||||||
train cli.
|
train cli.
|
||||||
|
|
|
@ -8,7 +8,7 @@ from ...gold import iob_to_biluo
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
|
def conllu2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Convert conllu files into JSON format for use with train cli.
|
Convert conllu files into JSON format for use with train cli.
|
||||||
|
|
|
@ -5,7 +5,7 @@ from ...compat import json_dumps, path2str
|
||||||
from ...util import prints
|
from ...util import prints
|
||||||
from ...gold import iob_to_biluo
|
from ...gold import iob_to_biluo
|
||||||
|
|
||||||
def conllubio2json(input_path, output_path, n_sents=10, use_morphology=False):
|
def conllubio2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
|
||||||
"""
|
"""
|
||||||
Convert conllu files into JSON format for use with train cli.
|
Convert conllu files into JSON format for use with train cli.
|
||||||
use_morphology parameter enables appending morphology to tags, which is
|
use_morphology parameter enables appending morphology to tags, which is
|
||||||
|
|
33
spacy/cli/converters/jsonl2json.py
Normal file
33
spacy/cli/converters/jsonl2json.py
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
import ujson as json
|
||||||
|
|
||||||
|
from .._messages import Messages
|
||||||
|
from ...compat import json_dumps, path2str
|
||||||
|
from ...util import prints, get_lang_class
|
||||||
|
from ...gold import docs_to_json
|
||||||
|
|
||||||
|
|
||||||
|
def ner_jsonl2json(input_path, output_path, lang=None, n_sents=10, use_morphology=False):
    """
    Convert JSONL-formatted NER annotations into JSON format for use with
    train cli.

    Each input line is expected to decode to a (raw_text, annotations) pair,
    where annotations['entities'] is a list of (start_char, end_char, label)
    triples. The raw text is tokenized with the tokenizer of `lang`, and each
    record is written out as a single-document entry.

    input_path: path object of the .jsonl file to read.
    output_path: path object of the directory the .json file is written to.
    lang: language code used to look up the tokenizer. Required; the process
        exits with an error message if it is None.
    n_sents, use_morphology: unused here; accepted so that all converters can
        be called with the same keyword arguments.
    """
    if lang is None:
        # Raw text has to be tokenized, which is impossible without a language.
        prints(Messages.M054, exits=1)
    json_docs = []
    nlp = get_lang_class(lang)()
    for i, (raw_text, annotations) in enumerate(read_jsonl(input_path)):
        doc = nlp.make_doc(raw_text)
        # An empty text produces a doc without tokens; indexing it would
        # raise an IndexError.
        if len(doc):
            doc[0].is_sent_start = True
        spans = [doc.char_span(start, end, label=label)
                 for start, end, label in annotations['entities']]
        # char_span returns None when the character offsets do not line up
        # with token boundaries; such entries cannot be stored in doc.ents.
        doc.ents = [span for span in spans if span is not None]
        json_docs.append(docs_to_json(i, [doc]))

    output_filename = input_path.parts[-1].replace(".jsonl", ".json")
    output_loc = output_path / output_filename
    with output_loc.open('w', encoding='utf8') as file_:
        file_.write(json_dumps(json_docs))
    prints(Messages.M033.format(n_docs=len(json_docs)),
           title=Messages.M032.format(name=path2str(output_loc)))
|
||||||
|
|
||||||
|
def read_jsonl(input_path):
    """
    Lazily read a JSONL file and yield one decoded object per line.

    Blank and whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped instead of being handed to the JSON parser,
    which would reject them.

    input_path: path object of the file to read; opened as UTF-8 text.
    YIELDS: the decoded JSON value of each non-blank line, in file order.
    """
    with input_path.open('r', encoding='utf8') as file_:
        for line in file_:
            line = line.strip()
            if not line:
                continue
            yield json.loads(line)
|
Loading…
Reference in New Issue
Block a user