diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 864051240..91ec75339 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -50,6 +50,7 @@ def convert_cli( converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"), ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True), lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"), + concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"), # fmt: on ): """ @@ -82,6 +83,7 @@ def convert_cli( converter=converter, ner_map=ner_map, lang=lang, + concatenate=concatenate, silent=silent, msg=msg, ) @@ -100,13 +102,15 @@ def convert( converter: str = "auto", ner_map: Optional[Path] = None, lang: Optional[str] = None, + concatenate: bool=False, silent: bool = True, msg: Optional[Printer], ) -> None: if not msg: msg = Printer(no_print=silent) ner_map = srsly.read_json(ner_map) if ner_map is not None else None - for input_loc in walk_directory(Path(input_path)): + doc_files = [] + for input_loc in walk_directory(Path(input_path), converter): input_data = input_loc.open("r", encoding="utf-8").read() # Use converter function to convert data func = CONVERTERS[converter] @@ -121,6 +125,13 @@ def convert( no_print=silent, ner_map=ner_map, ) + doc_files.append((input_loc, docs)) + if concatenate: + all_docs = [] + for _, docs in doc_files: + all_docs.extend(docs) + doc_files = [(input_path, all_docs)] + for input_loc, docs in doc_files: if file_type == "json": data = [docs_to_json(docs)] else: @@ -174,7 +185,7 @@ def autodetect_ner_format(input_data: str) -> Optional[str]: return None -def walk_directory(path: Path) -> List[Path]: +def walk_directory(path: Path, converter: str) -> List[Path]: if not path.is_dir(): return [path] paths = [path] @@ -188,6 +199,12 @@ def walk_directory(path: Path) -> List[Path]: continue elif path.is_dir(): paths.extend(path.iterdir()) + elif converter == "json" and not path.parts[-1].endswith("json"): + continue + elif converter == "conll" and not path.parts[-1].endswith("conll"): + continue + elif converter == "iob" and not path.parts[-1].endswith("iob"): + continue else: locs.append(path) return locs @@ -214,11 +231,11 @@ def verify_cli_args( if ner_map is not None and not Path(ner_map).exists(): msg.fail("NER map not found", ner_map, exits=1) if input_path.is_dir(): - input_locs = walk_directory(input_path) + input_locs = walk_directory(input_path, converter) if len(input_locs) == 0: msg.fail("No input files in directory", input_path, exits=1) file_types = list(set([loc.suffix[1:] for loc in input_locs])) - if len(file_types) >= 2: + if converter == "auto" and len(file_types) >= 2: file_types = ",".join(file_types) msg.fail("All input files must be same type", file_types, exits=1) if converter != "auto" and converter not in CONVERTERS: @@ -227,7 +244,7 @@ def verify_cli_args( def _get_converter(msg, converter, input_path): if input_path.is_dir(): - input_path = walk_directory(input_path)[0] + input_path = walk_directory(input_path, converter)[0] if converter == "auto": converter = input_path.suffix[1:] if converter == "ner" or converter == "iob":