Improve convert command

This commit is contained in:
Matthew Honnibal 2020-08-24 22:23:20 +02:00
parent a06a7f1f05
commit 3d7cd79f2d

View File

@ -50,6 +50,7 @@ def convert_cli(
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"), converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True), ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"), lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),
# fmt: on # fmt: on
): ):
""" """
@ -82,6 +83,7 @@ def convert_cli(
converter=converter, converter=converter,
ner_map=ner_map, ner_map=ner_map,
lang=lang, lang=lang,
concatenate=concatenate,
silent=silent, silent=silent,
msg=msg, msg=msg,
) )
@ -100,13 +102,15 @@ def convert(
converter: str = "auto", converter: str = "auto",
ner_map: Optional[Path] = None, ner_map: Optional[Path] = None,
lang: Optional[str] = None, lang: Optional[str] = None,
concatenate: bool=False,
silent: bool = True, silent: bool = True,
msg: Optional[Printer], msg: Optional[Printer],
) -> None: ) -> None:
if not msg: if not msg:
msg = Printer(no_print=silent) msg = Printer(no_print=silent)
ner_map = srsly.read_json(ner_map) if ner_map is not None else None ner_map = srsly.read_json(ner_map) if ner_map is not None else None
for input_loc in walk_directory(Path(input_path)): doc_files = []
for input_loc in walk_directory(Path(input_path), converter):
input_data = input_loc.open("r", encoding="utf-8").read() input_data = input_loc.open("r", encoding="utf-8").read()
# Use converter function to convert data # Use converter function to convert data
func = CONVERTERS[converter] func = CONVERTERS[converter]
@ -121,6 +125,13 @@ def convert(
no_print=silent, no_print=silent,
ner_map=ner_map, ner_map=ner_map,
) )
doc_files.append((input_loc, docs))
if concatenate:
all_docs = []
for _, docs in doc_files:
all_docs.extend(docs)
doc_files = [(input_path, all_docs)]
for input_loc, docs in doc_files:
if file_type == "json": if file_type == "json":
data = [docs_to_json(docs)] data = [docs_to_json(docs)]
else: else:
@ -174,7 +185,7 @@ def autodetect_ner_format(input_data: str) -> Optional[str]:
return None return None
def walk_directory(path: Path) -> List[Path]: def walk_directory(path: Path, converter: str) -> List[Path]:
if not path.is_dir(): if not path.is_dir():
return [path] return [path]
paths = [path] paths = [path]
@ -188,6 +199,12 @@ def walk_directory(path: Path) -> List[Path]:
continue continue
elif path.is_dir(): elif path.is_dir():
paths.extend(path.iterdir()) paths.extend(path.iterdir())
elif converter == "json" and not path.parts[-1].endswith("json"):
continue
elif converter == "conll" and not path.parts[-1].endswith("conll"):
continue
elif converter == "iob" and not path.parts[-1].endswith("iob"):
continue
else: else:
locs.append(path) locs.append(path)
return locs return locs
@ -214,11 +231,11 @@ def verify_cli_args(
if ner_map is not None and not Path(ner_map).exists(): if ner_map is not None and not Path(ner_map).exists():
msg.fail("NER map not found", ner_map, exits=1) msg.fail("NER map not found", ner_map, exits=1)
if input_path.is_dir(): if input_path.is_dir():
input_locs = walk_directory(input_path) input_locs = walk_directory(input_path, converter)
if len(input_locs) == 0: if len(input_locs) == 0:
msg.fail("No input files in directory", input_path, exits=1) msg.fail("No input files in directory", input_path, exits=1)
file_types = list(set([loc.suffix[1:] for loc in input_locs])) file_types = list(set([loc.suffix[1:] for loc in input_locs]))
if len(file_types) >= 2: if converter == "auto" and len(file_types) >= 2:
file_types = ",".join(file_types) file_types = ",".join(file_types)
msg.fail("All input files must be same type", file_types, exits=1) msg.fail("All input files must be same type", file_types, exits=1)
if converter != "auto" and converter not in CONVERTERS: if converter != "auto" and converter not in CONVERTERS:
@ -227,7 +244,7 @@ def verify_cli_args(
def _get_converter(msg, converter, input_path): def _get_converter(msg, converter, input_path):
if input_path.is_dir(): if input_path.is_dir():
input_path = walk_directory(input_path)[0] input_path = walk_directory(input_path, converter)[0]
if converter == "auto": if converter == "auto":
converter = input_path.suffix[1:] converter = input_path.suffix[1:]
if converter == "ner" or converter == "iob": if converter == "ner" or converter == "iob":