mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 02:36:32 +03:00
Improve convert command
This commit is contained in:
parent
a06a7f1f05
commit
3d7cd79f2d
|
@ -50,6 +50,7 @@ def convert_cli(
|
|||
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
|
||||
ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
|
||||
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
|
||||
concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
|
@ -82,6 +83,7 @@ def convert_cli(
|
|||
converter=converter,
|
||||
ner_map=ner_map,
|
||||
lang=lang,
|
||||
concatenate=concatenate,
|
||||
silent=silent,
|
||||
msg=msg,
|
||||
)
|
||||
|
@ -100,13 +102,15 @@ def convert(
|
|||
converter: str = "auto",
|
||||
ner_map: Optional[Path] = None,
|
||||
lang: Optional[str] = None,
|
||||
concatenate: bool=False,
|
||||
silent: bool = True,
|
||||
msg: Optional[Printer],
|
||||
) -> None:
|
||||
if not msg:
|
||||
msg = Printer(no_print=silent)
|
||||
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
|
||||
for input_loc in walk_directory(Path(input_path)):
|
||||
doc_files = []
|
||||
for input_loc in walk_directory(Path(input_path), converter):
|
||||
input_data = input_loc.open("r", encoding="utf-8").read()
|
||||
# Use converter function to convert data
|
||||
func = CONVERTERS[converter]
|
||||
|
@ -121,6 +125,13 @@ def convert(
|
|||
no_print=silent,
|
||||
ner_map=ner_map,
|
||||
)
|
||||
doc_files.append((input_loc, docs))
|
||||
if concatenate:
|
||||
all_docs = []
|
||||
for _, docs in doc_files:
|
||||
all_docs.extend(docs)
|
||||
doc_files = [(input_path, all_docs)]
|
||||
for input_loc, docs in doc_files:
|
||||
if file_type == "json":
|
||||
data = [docs_to_json(docs)]
|
||||
else:
|
||||
|
@ -174,7 +185,7 @@ def autodetect_ner_format(input_data: str) -> Optional[str]:
|
|||
return None
|
||||
|
||||
|
||||
def walk_directory(path: Path) -> List[Path]:
|
||||
def walk_directory(path: Path, converter: str) -> List[Path]:
|
||||
if not path.is_dir():
|
||||
return [path]
|
||||
paths = [path]
|
||||
|
@ -188,6 +199,12 @@ def walk_directory(path: Path) -> List[Path]:
|
|||
continue
|
||||
elif path.is_dir():
|
||||
paths.extend(path.iterdir())
|
||||
elif converter == "json" and not path.parts[-1].endswith("json"):
|
||||
continue
|
||||
elif converter == "conll" and not path.parts[-1].endswith("conll"):
|
||||
continue
|
||||
elif converter == "iob" and not path.parts[-1].endswith("iob"):
|
||||
continue
|
||||
else:
|
||||
locs.append(path)
|
||||
return locs
|
||||
|
@ -214,11 +231,11 @@ def verify_cli_args(
|
|||
if ner_map is not None and not Path(ner_map).exists():
|
||||
msg.fail("NER map not found", ner_map, exits=1)
|
||||
if input_path.is_dir():
|
||||
input_locs = walk_directory(input_path)
|
||||
input_locs = walk_directory(input_path, converter)
|
||||
if len(input_locs) == 0:
|
||||
msg.fail("No input files in directory", input_path, exits=1)
|
||||
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
|
||||
if len(file_types) >= 2:
|
||||
if converter == "auto" and len(file_types) >= 2:
|
||||
file_types = ",".join(file_types)
|
||||
msg.fail("All input files must be same type", file_types, exits=1)
|
||||
if converter != "auto" and converter not in CONVERTERS:
|
||||
|
@ -227,7 +244,7 @@ def verify_cli_args(
|
|||
|
||||
def _get_converter(msg, converter, input_path):
|
||||
if input_path.is_dir():
|
||||
input_path = walk_directory(input_path)[0]
|
||||
input_path = walk_directory(input_path, converter)[0]
|
||||
if converter == "auto":
|
||||
converter = input_path.suffix[1:]
|
||||
if converter == "ner" or converter == "iob":
|
||||
|
|
Loading…
Reference in New Issue
Block a user