mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 17:54:39 +03:00
Improve convert command
This commit is contained in:
parent
a06a7f1f05
commit
3d7cd79f2d
|
@ -50,6 +50,7 @@ def convert_cli(
|
||||||
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
|
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
|
||||||
ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
|
ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
|
||||||
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
|
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
|
||||||
|
concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
@ -82,6 +83,7 @@ def convert_cli(
|
||||||
converter=converter,
|
converter=converter,
|
||||||
ner_map=ner_map,
|
ner_map=ner_map,
|
||||||
lang=lang,
|
lang=lang,
|
||||||
|
concatenate=concatenate,
|
||||||
silent=silent,
|
silent=silent,
|
||||||
msg=msg,
|
msg=msg,
|
||||||
)
|
)
|
||||||
|
@ -100,13 +102,15 @@ def convert(
|
||||||
converter: str = "auto",
|
converter: str = "auto",
|
||||||
ner_map: Optional[Path] = None,
|
ner_map: Optional[Path] = None,
|
||||||
lang: Optional[str] = None,
|
lang: Optional[str] = None,
|
||||||
|
concatenate: bool=False,
|
||||||
silent: bool = True,
|
silent: bool = True,
|
||||||
msg: Optional[Printer],
|
msg: Optional[Printer],
|
||||||
) -> None:
|
) -> None:
|
||||||
if not msg:
|
if not msg:
|
||||||
msg = Printer(no_print=silent)
|
msg = Printer(no_print=silent)
|
||||||
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
|
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
|
||||||
for input_loc in walk_directory(Path(input_path)):
|
doc_files = []
|
||||||
|
for input_loc in walk_directory(Path(input_path), converter):
|
||||||
input_data = input_loc.open("r", encoding="utf-8").read()
|
input_data = input_loc.open("r", encoding="utf-8").read()
|
||||||
# Use converter function to convert data
|
# Use converter function to convert data
|
||||||
func = CONVERTERS[converter]
|
func = CONVERTERS[converter]
|
||||||
|
@ -121,6 +125,13 @@ def convert(
|
||||||
no_print=silent,
|
no_print=silent,
|
||||||
ner_map=ner_map,
|
ner_map=ner_map,
|
||||||
)
|
)
|
||||||
|
doc_files.append((input_loc, docs))
|
||||||
|
if concatenate:
|
||||||
|
all_docs = []
|
||||||
|
for _, docs in doc_files:
|
||||||
|
all_docs.extend(docs)
|
||||||
|
doc_files = [(input_path, all_docs)]
|
||||||
|
for input_loc, docs in doc_files:
|
||||||
if file_type == "json":
|
if file_type == "json":
|
||||||
data = [docs_to_json(docs)]
|
data = [docs_to_json(docs)]
|
||||||
else:
|
else:
|
||||||
|
@ -174,7 +185,7 @@ def autodetect_ner_format(input_data: str) -> Optional[str]:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def walk_directory(path: Path) -> List[Path]:
|
def walk_directory(path: Path, converter: str) -> List[Path]:
|
||||||
if not path.is_dir():
|
if not path.is_dir():
|
||||||
return [path]
|
return [path]
|
||||||
paths = [path]
|
paths = [path]
|
||||||
|
@ -188,6 +199,12 @@ def walk_directory(path: Path) -> List[Path]:
|
||||||
continue
|
continue
|
||||||
elif path.is_dir():
|
elif path.is_dir():
|
||||||
paths.extend(path.iterdir())
|
paths.extend(path.iterdir())
|
||||||
|
elif converter == "json" and not path.parts[-1].endswith("json"):
|
||||||
|
continue
|
||||||
|
elif converter == "conll" and not path.parts[-1].endswith("conll"):
|
||||||
|
continue
|
||||||
|
elif converter == "iob" and not path.parts[-1].endswith("iob"):
|
||||||
|
continue
|
||||||
else:
|
else:
|
||||||
locs.append(path)
|
locs.append(path)
|
||||||
return locs
|
return locs
|
||||||
|
@ -214,11 +231,11 @@ def verify_cli_args(
|
||||||
if ner_map is not None and not Path(ner_map).exists():
|
if ner_map is not None and not Path(ner_map).exists():
|
||||||
msg.fail("NER map not found", ner_map, exits=1)
|
msg.fail("NER map not found", ner_map, exits=1)
|
||||||
if input_path.is_dir():
|
if input_path.is_dir():
|
||||||
input_locs = walk_directory(input_path)
|
input_locs = walk_directory(input_path, converter)
|
||||||
if len(input_locs) == 0:
|
if len(input_locs) == 0:
|
||||||
msg.fail("No input files in directory", input_path, exits=1)
|
msg.fail("No input files in directory", input_path, exits=1)
|
||||||
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
|
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
|
||||||
if len(file_types) >= 2:
|
if converter == "auto" and len(file_types) >= 2:
|
||||||
file_types = ",".join(file_types)
|
file_types = ",".join(file_types)
|
||||||
msg.fail("All input files must be same type", file_types, exits=1)
|
msg.fail("All input files must be same type", file_types, exits=1)
|
||||||
if converter != "auto" and converter not in CONVERTERS:
|
if converter != "auto" and converter not in CONVERTERS:
|
||||||
|
@ -227,7 +244,7 @@ def verify_cli_args(
|
||||||
|
|
||||||
def _get_converter(msg, converter, input_path):
|
def _get_converter(msg, converter, input_path):
|
||||||
if input_path.is_dir():
|
if input_path.is_dir():
|
||||||
input_path = walk_directory(input_path)[0]
|
input_path = walk_directory(input_path, converter)[0]
|
||||||
if converter == "auto":
|
if converter == "auto":
|
||||||
converter = input_path.suffix[1:]
|
converter = input_path.suffix[1:]
|
||||||
if converter == "ner" or converter == "iob":
|
if converter == "ner" or converter == "iob":
|
||||||
|
|
Loading…
Reference in New Issue
Block a user