fix processing of "auto" in walk_directory

This commit is contained in:
svlandeg 2023-01-03 18:22:35 +01:00
parent 31c1beba78
commit a4248984e0
3 changed files with 36 additions and 8 deletions

View File

@ -29,6 +29,8 @@ if TYPE_CHECKING:
SDIST_SUFFIX = ".tar.gz"
WHEEL_SUFFIX = "-py3-none-any.whl"
AUTO = "auto"
PROJECT_FILE = "project.yml"
PROJECT_LOCK = "project.lock"
COMMAND = "python -m spacy"
@ -596,6 +598,8 @@ def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
continue
elif path.is_dir():
paths.extend(path.iterdir())
elif suffix == AUTO:
locs.append(path)
elif suffix is not None and not path.parts[-1].endswith(suffix):
continue
else:

View File

@ -7,7 +7,7 @@ import re
import sys
import itertools
from ._util import app, Arg, Opt, walk_directory
from ._util import app, Arg, Opt, walk_directory, AUTO
from ..training import docs_to_json
from ..tokens import Doc, DocBin
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
@ -49,7 +49,7 @@ def convert_cli(
model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
converter: str = Opt(AUTO, "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),
@ -100,7 +100,7 @@ def convert(
model: Optional[str] = None,
morphology: bool = False,
merge_subtokens: bool = False,
converter: str = "auto",
converter: str = AUTO,
ner_map: Optional[Path] = None,
lang: Optional[str] = None,
concatenate: bool = False,
@ -213,17 +213,17 @@ def verify_cli_args(
if len(input_locs) == 0:
msg.fail("No input files in directory", input_path, exits=1)
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
if converter == "auto" and len(file_types) >= 2:
if converter == AUTO and len(file_types) >= 2:
file_types_str = ",".join(file_types)
msg.fail("All input files must be same type", file_types_str, exits=1)
if converter != "auto" and converter not in CONVERTERS:
if converter != AUTO and converter not in CONVERTERS:
msg.fail(f"Can't find converter for {converter}", exits=1)
def _get_converter(msg, converter, input_path: Path):
if input_path.is_dir():
input_path = walk_directory(input_path, converter)[0]
if converter == "auto":
if converter == AUTO:
converter = input_path.suffix[1:]
if converter == "ner" or converter == "iob":
with input_path.open(encoding="utf8") as file_:

View File

@ -4,6 +4,7 @@ from collections import Counter
from typing import Tuple, List, Dict, Any
import pkg_resources
import time
from pathlib import Path
import spacy
import numpy
@ -15,11 +16,11 @@ from thinc.api import Config, ConfigValidationError
from spacy import about
from spacy.cli import info
from spacy.cli._util import is_subpath_of, load_project_config
from spacy.cli._util import is_subpath_of, load_project_config, walk_directory
from spacy.cli._util import parse_config_overrides, string_to_list
from spacy.cli._util import substitute_project_variables
from spacy.cli._util import validate_project_commands
from spacy.cli._util import upload_file, download_file
from spacy.cli._util import upload_file, download_file, AUTO
from spacy.cli.debug_data import _compile_gold, _get_labels_from_model
from spacy.cli.debug_data import _get_labels_from_spancat
from spacy.cli.debug_data import _get_distribution, _get_kl_divergence
@ -1185,3 +1186,26 @@ def test_upload_download_local_file():
download_file(remote_file, local_file)
with local_file.open(mode="r") as file_:
assert file_.read() == content
def test_walk_directory():
    """Check how many files walk_directory returns for various suffixes.

    Seven empty files with four different extensions are created in a
    temporary directory. The unfiltered call and the AUTO call are both
    expected to return all seven; a concrete suffix is expected to return
    only the files whose names end with it.
    """
    filenames = (
        "data1.iob",
        "data2.iob",
        "data3.json",
        "data4.conll",
        "data5.conll",
        "data6.conll",
        "data7.txt",
    )
    # (suffix, expected number of matches), checked in this order.
    expected_by_suffix = (
        ("json", 1),
        ("iob", 2),
        ("conll", 3),
        ("pdf", 0),  # no file carries this extension
        (AUTO, 7),  # AUTO is expected to match every file
    )
    with make_tempdir() as tmp_dir:
        for filename in filenames:
            Path(tmp_dir / filename).touch()

        # No suffix given: every file is returned.
        assert len(walk_directory(tmp_dir)) == 7

        for suffix, expected_count in expected_by_suffix:
            found = walk_directory(tmp_dir, suffix=suffix)
            assert len(found) == expected_count