Make `spacy convert` output DocBin

This commit is contained in:
Matthew Honnibal 2020-06-20 15:55:35 +02:00
parent 0d22c6e006
commit 7a846921a3

View File

@ -4,8 +4,9 @@ import srsly
import re import re
import sys import sys
from .converters import conllu2json, iob2json, conll_ner2json from ..tokens import DocBin
from .converters import ner_jsonl2json from ..gold.converters import iob2docs, conll_ner2docs, json2docs
from ..gold.converters import ner_jsonl2docs
# Converters are matched by file extension except for ner/iob, which are # Converters are matched by file extension except for ner/iob, which are
@ -13,13 +14,13 @@ from .converters import ner_jsonl2json
# entry to this dict with the file extension mapped to the converter function # entry to this dict with the file extension mapped to the converter function
# imported from /converters. # imported from /converters.
DOC_CONVERTERS = { CONVERTERS = {
"conllubio": conllu2doc, #"conllubio": conllu2docs, TODO
"conllu": conllu2doc, #"conllu": conllu2docs, TODO
"conll": conllu2doc, #"conll": conllu2docs, TODO
"ner": conll_ner2doc, "ner": conll_ner2docs,
"iob": iob2doc, "iob": iob2docs,
"jsonl": ner_jsonl2doc, "jsonl": ner_jsonl2docs,
"json": json2docs, "json": json2docs,
} }
@ -42,59 +43,32 @@ FILE_TYPES_STDOUT = ("json", "jsonl")
def convert( def convert(
# fmt: off # fmt: off
input_file: ("Input file", "positional", None, str), input_path: ("Input file or directory", "positional", None, Path),
output_dir: ("Output directory. '-' for stdout.", "positional", None, str) = "-", output_dir: ("Output directory.", "positional", None, Path),
file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "json", file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "spacy",
n_sents: ("Number of sentences per doc (0 to disable)", "option", "n", int) = 1, n_sents: ("Number of sentences per doc (0 to disable)", "option", "n", int) = 1,
seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False, seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False,
model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None, model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None,
morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False, morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False,
merge_subtokens: ("Merge CoNLL-U subtokens", "flag", "T", bool) = False, merge_subtokens: ("Merge CoNLL-U subtokens", "flag", "T", bool) = False,
converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto", converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto",
ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None, ner_map: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None,
lang: ("Language (if tokenizer required)", "option", "l", str) = None, lang: ("Language (if tokenizer required)", "option", "l", str) = None,
# fmt: on # fmt: on
): ):
""" """
Convert files into JSON format for use with train command and other Convert files into json or DocBin format for use with train command and other
experiment management functions. If no output_dir is specified, the data experiment management functions.
is written to stdout, so you can pipe them forward to a JSON file:
$ spacy convert some_file.conllu > some_file.json
""" """
cli_args = locals()
no_print = output_dir == "-" no_print = output_dir == "-"
output_dir = Path(output_dir) if output_dir != "-" else "-"
msg = Printer(no_print=no_print) msg = Printer(no_print=no_print)
input_path = Path(input_file) verify_cli_args(msg, **cli_args)
if file_type not in FILE_TYPES_STDOUT and output_dir == "-": converter = _get_converter(msg, converter, input_path)
# TODO: support msgpack via stdout in srsly? ner_map = srsly.read_json(ner_map) if ner_map is not None else None
msg.fail( for input_loc in walk_directory(input_path):
f"Can't write .{file_type} data to stdout", input_data = input_loc.open("r", encoding="utf-8").read()
"Please specify an output directory.",
exits=1,
)
if not input_path.exists():
msg.fail("Input file not found", input_path, exits=1)
if output_dir != "-" and not Path(output_dir).exists():
msg.fail("Output directory not found", output_dir, exits=1)
input_data = input_path.open("r", encoding="utf-8").read()
if converter == "auto":
converter = input_path.suffix[1:]
if converter == "ner" or converter == "iob":
converter_autodetect = autodetect_ner_format(input_data)
if converter_autodetect == "ner":
msg.info("Auto-detected token-per-line NER format")
converter = converter_autodetect
elif converter_autodetect == "iob":
msg.info("Auto-detected sentence-per-line NER format")
converter = converter_autodetect
else:
msg.warn(
"Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
)
if converter not in CONVERTERS:
msg.fail(f"Can't find converter for {converter}", exits=1)
ner_map = None
if ner_map_path is not None:
ner_map = srsly.read_json(ner_map_path)
# Use converter function to convert data # Use converter function to convert data
func = CONVERTERS[converter] func = CONVERTERS[converter]
docs = func( docs = func(
@ -108,27 +82,19 @@ def convert(
no_print=no_print, no_print=no_print,
ner_map=ner_map, ner_map=ner_map,
) )
if write_json: suffix = f".{file_type}"
subpath = input_loc.relative_to(input_path)
output_file = (output_dir / subpath).with_suffix(suffix)
if not output_file.parent.exists():
output_file.parent.mkdir(parents=True)
if file_type == "json":
data = docs2json(docs) data = docs2json(docs)
srsly.write_json(output_file, docs2json(docs))
else: else:
data = DocBin(attrs=ALL_ATTRS, docs=docs).to_bytes() data = DocBin(attrs=ALL_ATTRS, docs=docs).to_bytes()
if output_dir == "-":
if write_json:
srsly.write_json("-", data)
else:
sys.stdout.write(data)
else:
# Export data to a file
if write_json:
suffix = f".{file_type}"
output_file = output_dir / input_path.parts[-1].with_suffix(suffix)
srsly.write_json(output_file, data)
else:
output_file = output_dir / input_path.parts[-1].with_suffix("spacy")
with output_file.open("wb") as file_: with output_file.open("wb") as file_:
file_.write(data) file_.write(data)
msg.good(f"Generated output file ({len(data)} documents): {output_file}") msg.good(f"Generated output file ({len(docs)} documents): {output_file}")
def autodetect_ner_format(input_data): def autodetect_ner_format(input_data):
@ -148,3 +114,102 @@ def autodetect_ner_format(input_data):
if format_guesses["ner"] == 0 and format_guesses["iob"] > 0: if format_guesses["ner"] == 0 and format_guesses["iob"] > 0:
return "iob" return "iob"
return None return None
def walk_directory(path):
    """Return all non-hidden files under *path*, recursing into
    subdirectories. If *path* is itself a file, return it as a
    single-element list. Paths whose final component starts with a dot
    (hidden files and directories) are skipped entirely.
    """
    if not path.is_dir():
        return [path]
    # Breadth-first traversal with an explicit queue and read cursor,
    # so nothing is mutated while being iterated.
    queue = [path]
    found = []
    visited = set()
    cursor = 0
    while cursor < len(queue):
        current = queue[cursor]
        cursor += 1
        key = str(current)
        if key in visited:
            continue
        visited.add(key)
        if current.parts[-1].startswith("."):
            # Hidden entry: don't collect it, don't descend into it.
            continue
        if current.is_dir():
            queue.extend(current.iterdir())
        else:
            found.append(current)
    return found
def verify_cli_args(
    msg,
    input_path,
    output_dir,
    file_type,
    n_sents,
    seg_sents,
    model,
    morphology,
    merge_subtokens,
    converter,
    ner_map,
    lang
):
    """Validate the arguments of the convert CLI command.

    Fatal problems are reported via msg.fail(..., exits=1), which exits the
    process. Returns the resolved converter name on success.
    """
    # Validate paths first so the user gets a clean CLI error instead of a
    # FileNotFoundError traceback from the autodetection read below.
    if not input_path.exists():
        msg.fail("Input file not found", input_path, exits=1)
    if output_dir != "-" and not Path(output_dir).exists():
        msg.fail("Output directory not found", output_dir, exits=1)
    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
        # TODO: support msgpack via stdout in srsly?
        msg.fail(
            f"Can't write .{file_type} data to stdout",
            "Please specify an output directory.",
            exits=1,
        )
    if input_path.is_dir():
        input_locs = walk_directory(input_path)
        if not input_locs:
            msg.fail("No input files in directory", input_path, exits=1)
        file_types = list(set(loc.suffix[1:] for loc in input_locs))
        if len(file_types) >= 2:
            # Keep file_types as a list; join only for the message.
            msg.fail(
                "All input files must be same type",
                ",".join(file_types),
                exits=1,
            )
        if converter == "auto":
            converter = file_types[0]
    elif converter == "auto":
        # Only infer from the file suffix when no explicit converter was
        # given; previously an explicit -c choice was clobbered here.
        converter = input_path.suffix[1:]
    if converter == "ner" or converter == "iob":
        # Sample the first concrete file: input_path itself may be a
        # directory, which can't be opened directly.
        sample_loc = walk_directory(input_path)[0]
        with sample_loc.open("r", encoding="utf-8") as file_:
            input_data = file_.read()
        converter_autodetect = autodetect_ner_format(input_data)
        if converter_autodetect == "ner":
            msg.info("Auto-detected token-per-line NER format")
            converter = converter_autodetect
        elif converter_autodetect == "iob":
            msg.info("Auto-detected sentence-per-line NER format")
            converter = converter_autodetect
        else:
            msg.warn(
                "Can't automatically detect NER format. Conversion may not",
                "succeed. See https://spacy.io/api/cli#convert",
            )
    if converter not in CONVERTERS:
        msg.fail(f"Can't find converter for {converter}", exits=1)
    return converter
def _get_converter(msg, converter, input_path):
if input_path.is_dir():
input_path = walk_directory(input_path)[0]
if converter == "auto":
converter = input_path.suffix[1:]
if converter == "ner" or converter == "iob":
with input_path.open() as file_:
input_data = file_.read()
converter_autodetect = autodetect_ner_format(input_data)
if converter_autodetect == "ner":
msg.info("Auto-detected token-per-line NER format")
converter = converter_autodetect
elif converter_autodetect == "iob":
msg.info("Auto-detected sentence-per-line NER format")
converter = converter_autodetect
else:
msg.warn(
"Can't automatically detect NER format. "
"Conversion may not succeed. "
"See https://spacy.io/api/cli#convert"
)
return converter