Improve convert command

This commit is contained in:
Matthew Honnibal 2020-08-24 22:23:20 +02:00
parent a06a7f1f05
commit 3d7cd79f2d

View File

@ -50,6 +50,7 @@ def convert_cli(
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),
# fmt: on
):
"""
@ -82,6 +83,7 @@ def convert_cli(
converter=converter,
ner_map=ner_map,
lang=lang,
concatenate=concatenate,
silent=silent,
msg=msg,
)
@ -100,13 +102,15 @@ def convert(
converter: str = "auto",
ner_map: Optional[Path] = None,
lang: Optional[str] = None,
concatenate: bool=False,
silent: bool = True,
msg: Optional[Printer],
) -> None:
if not msg:
msg = Printer(no_print=silent)
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
for input_loc in walk_directory(Path(input_path)):
doc_files = []
for input_loc in walk_directory(Path(input_path), converter):
input_data = input_loc.open("r", encoding="utf-8").read()
# Use converter function to convert data
func = CONVERTERS[converter]
@ -121,6 +125,13 @@ def convert(
no_print=silent,
ner_map=ner_map,
)
doc_files.append((input_loc, docs))
if concatenate:
all_docs = []
for _, docs in doc_files:
all_docs.extend(docs)
doc_files = [(input_path, all_docs)]
for input_loc, docs in doc_files:
if file_type == "json":
data = [docs_to_json(docs)]
else:
@ -174,7 +185,7 @@ def autodetect_ner_format(input_data: str) -> Optional[str]:
return None
def walk_directory(path: Path) -> List[Path]:
def walk_directory(path: Path, converter: str) -> List[Path]:
if not path.is_dir():
return [path]
paths = [path]
@ -188,6 +199,12 @@ def walk_directory(path: Path) -> List[Path]:
continue
elif path.is_dir():
paths.extend(path.iterdir())
elif converter == "json" and not path.parts[-1].endswith("json"):
continue
elif converter == "conll" and not path.parts[-1].endswith("conll"):
continue
elif converter == "iob" and not path.parts[-1].endswith("iob"):
continue
else:
locs.append(path)
return locs
@ -214,11 +231,11 @@ def verify_cli_args(
if ner_map is not None and not Path(ner_map).exists():
msg.fail("NER map not found", ner_map, exits=1)
if input_path.is_dir():
input_locs = walk_directory(input_path)
input_locs = walk_directory(input_path, converter)
if len(input_locs) == 0:
msg.fail("No input files in directory", input_path, exits=1)
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
if len(file_types) >= 2:
if converter == "auto" and len(file_types) >= 2:
file_types = ",".join(file_types)
msg.fail("All input files must be same type", file_types, exits=1)
if converter != "auto" and converter not in CONVERTERS:
@ -227,7 +244,7 @@ def verify_cli_args(
def _get_converter(msg, converter, input_path):
if input_path.is_dir():
input_path = walk_directory(input_path)[0]
input_path = walk_directory(input_path, converter)[0]
if converter == "auto":
converter = input_path.suffix[1:]
if converter == "ner" or converter == "iob":