mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-25 00:34:20 +03:00
fix processing of "auto" in convert (#12050)
* fix processing of "auto" in walk_directory * add check for None * move AUTO check to convert and fix verification of args * add specific CLI test with CliRunner * cleanup * more cleanup * update docstring
This commit is contained in:
parent
dbd829f0ed
commit
7f6c638c3a
|
@ -583,6 +583,10 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
|
||||||
|
|
||||||
|
|
||||||
def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
|
def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
|
||||||
|
"""Given a directory and a suffix, recursively find all files matching the suffix.
|
||||||
|
Directories or files with names beginning with a . are ignored, but hidden flags on
|
||||||
|
filesystems are not checked.
|
||||||
|
When provided with a suffix `None`, there is no suffix-based filtering."""
|
||||||
if not path.is_dir():
|
if not path.is_dir():
|
||||||
return [path]
|
return [path]
|
||||||
paths = [path]
|
paths = [path]
|
||||||
|
|
|
@ -28,6 +28,8 @@ CONVERTERS: Mapping[str, Callable[..., Iterable[Doc]]] = {
|
||||||
"json": json_to_docs,
|
"json": json_to_docs,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
AUTO = "auto"
|
||||||
|
|
||||||
|
|
||||||
# File types that can be written to stdout
|
# File types that can be written to stdout
|
||||||
FILE_TYPES_STDOUT = ("json",)
|
FILE_TYPES_STDOUT = ("json",)
|
||||||
|
@ -49,7 +51,7 @@ def convert_cli(
|
||||||
model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
|
model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
|
||||||
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
|
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
|
||||||
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
|
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
|
||||||
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
|
converter: str = Opt(AUTO, "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
|
||||||
ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
|
ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
|
||||||
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
|
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
|
||||||
concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),
|
concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),
|
||||||
|
@ -70,8 +72,8 @@ def convert_cli(
|
||||||
output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir
|
output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir
|
||||||
silent = output_dir == "-"
|
silent = output_dir == "-"
|
||||||
msg = Printer(no_print=silent)
|
msg = Printer(no_print=silent)
|
||||||
verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
|
|
||||||
converter = _get_converter(msg, converter, input_path)
|
converter = _get_converter(msg, converter, input_path)
|
||||||
|
verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
|
||||||
convert(
|
convert(
|
||||||
input_path,
|
input_path,
|
||||||
output_dir,
|
output_dir,
|
||||||
|
@ -100,7 +102,7 @@ def convert(
|
||||||
model: Optional[str] = None,
|
model: Optional[str] = None,
|
||||||
morphology: bool = False,
|
morphology: bool = False,
|
||||||
merge_subtokens: bool = False,
|
merge_subtokens: bool = False,
|
||||||
converter: str = "auto",
|
converter: str,
|
||||||
ner_map: Optional[Path] = None,
|
ner_map: Optional[Path] = None,
|
||||||
lang: Optional[str] = None,
|
lang: Optional[str] = None,
|
||||||
concatenate: bool = False,
|
concatenate: bool = False,
|
||||||
|
@ -212,18 +214,22 @@ def verify_cli_args(
|
||||||
input_locs = walk_directory(input_path, converter)
|
input_locs = walk_directory(input_path, converter)
|
||||||
if len(input_locs) == 0:
|
if len(input_locs) == 0:
|
||||||
msg.fail("No input files in directory", input_path, exits=1)
|
msg.fail("No input files in directory", input_path, exits=1)
|
||||||
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
|
if converter not in CONVERTERS:
|
||||||
if converter == "auto" and len(file_types) >= 2:
|
|
||||||
file_types_str = ",".join(file_types)
|
|
||||||
msg.fail("All input files must be same type", file_types_str, exits=1)
|
|
||||||
if converter != "auto" and converter not in CONVERTERS:
|
|
||||||
msg.fail(f"Can't find converter for {converter}", exits=1)
|
msg.fail(f"Can't find converter for {converter}", exits=1)
|
||||||
|
|
||||||
|
|
||||||
def _get_converter(msg, converter, input_path: Path):
|
def _get_converter(msg, converter, input_path: Path):
|
||||||
if input_path.is_dir():
|
if input_path.is_dir():
|
||||||
input_path = walk_directory(input_path, converter)[0]
|
if converter == AUTO:
|
||||||
if converter == "auto":
|
input_locs = walk_directory(input_path, suffix=None)
|
||||||
|
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
|
||||||
|
if len(file_types) >= 2:
|
||||||
|
file_types_str = ",".join(file_types)
|
||||||
|
msg.fail("All input files must be same type", file_types_str, exits=1)
|
||||||
|
input_path = input_locs[0]
|
||||||
|
else:
|
||||||
|
input_path = walk_directory(input_path, suffix=converter)[0]
|
||||||
|
if converter == AUTO:
|
||||||
converter = input_path.suffix[1:]
|
converter = input_path.suffix[1:]
|
||||||
if converter == "ner" or converter == "iob":
|
if converter == "ner" or converter == "iob":
|
||||||
with input_path.open(encoding="utf8") as file_:
|
with input_path.open(encoding="utf8") as file_:
|
||||||
|
|
|
@ -4,6 +4,7 @@ from collections import Counter
|
||||||
from typing import Tuple, List, Dict, Any
|
from typing import Tuple, List, Dict, Any
|
||||||
import pkg_resources
|
import pkg_resources
|
||||||
import time
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
import spacy
|
import spacy
|
||||||
import numpy
|
import numpy
|
||||||
|
@ -15,7 +16,7 @@ from thinc.api import Config, ConfigValidationError
|
||||||
|
|
||||||
from spacy import about
|
from spacy import about
|
||||||
from spacy.cli import info
|
from spacy.cli import info
|
||||||
from spacy.cli._util import is_subpath_of, load_project_config
|
from spacy.cli._util import is_subpath_of, load_project_config, walk_directory
|
||||||
from spacy.cli._util import parse_config_overrides, string_to_list
|
from spacy.cli._util import parse_config_overrides, string_to_list
|
||||||
from spacy.cli._util import substitute_project_variables
|
from spacy.cli._util import substitute_project_variables
|
||||||
from spacy.cli._util import validate_project_commands
|
from spacy.cli._util import validate_project_commands
|
||||||
|
@ -1185,3 +1186,26 @@ def test_upload_download_local_file():
|
||||||
download_file(remote_file, local_file)
|
download_file(remote_file, local_file)
|
||||||
with local_file.open(mode="r") as file_:
|
with local_file.open(mode="r") as file_:
|
||||||
assert file_.read() == content
|
assert file_.read() == content
|
||||||
|
|
||||||
|
|
||||||
|
def test_walk_directory():
|
||||||
|
with make_tempdir() as d:
|
||||||
|
files = [
|
||||||
|
"data1.iob",
|
||||||
|
"data2.iob",
|
||||||
|
"data3.json",
|
||||||
|
"data4.conll",
|
||||||
|
"data5.conll",
|
||||||
|
"data6.conll",
|
||||||
|
"data7.txt",
|
||||||
|
]
|
||||||
|
|
||||||
|
for f in files:
|
||||||
|
Path(d / f).touch()
|
||||||
|
|
||||||
|
assert (len(walk_directory(d))) == 7
|
||||||
|
assert (len(walk_directory(d, suffix=None))) == 7
|
||||||
|
assert (len(walk_directory(d, suffix="json"))) == 1
|
||||||
|
assert (len(walk_directory(d, suffix="iob"))) == 2
|
||||||
|
assert (len(walk_directory(d, suffix="conll"))) == 3
|
||||||
|
assert (len(walk_directory(d, suffix="pdf"))) == 0
|
||||||
|
|
33
spacy/tests/test_cli_app.py
Normal file
33
spacy/tests/test_cli_app.py
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typer.testing import CliRunner
|
||||||
|
|
||||||
|
from spacy.cli._util import app
|
||||||
|
from .util import make_tempdir
|
||||||
|
|
||||||
|
|
||||||
|
def test_convert_auto():
|
||||||
|
with make_tempdir() as d_in, make_tempdir() as d_out:
|
||||||
|
for f in ["data1.iob", "data2.iob", "data3.iob"]:
|
||||||
|
Path(d_in / f).touch()
|
||||||
|
|
||||||
|
# ensure that "automatic" suffix detection works
|
||||||
|
result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
|
||||||
|
assert "Generated output file" in result.stdout
|
||||||
|
out_files = os.listdir(d_out)
|
||||||
|
assert len(out_files) == 3
|
||||||
|
assert "data1.spacy" in out_files
|
||||||
|
assert "data2.spacy" in out_files
|
||||||
|
assert "data3.spacy" in out_files
|
||||||
|
|
||||||
|
|
||||||
|
def test_convert_auto_conflict():
|
||||||
|
with make_tempdir() as d_in, make_tempdir() as d_out:
|
||||||
|
for f in ["data1.iob", "data2.iob", "data3.json"]:
|
||||||
|
Path(d_in / f).touch()
|
||||||
|
|
||||||
|
# ensure that "automatic" suffix detection warns when there are different file types
|
||||||
|
result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
|
||||||
|
assert "All input files must be same type" in result.stdout
|
||||||
|
out_files = os.listdir(d_out)
|
||||||
|
assert len(out_files) == 0
|
Loading…
Reference in New Issue
Block a user