From a4248984e01ea77a12a4e4a7747359dd7d2490e5 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 3 Jan 2023 18:22:35 +0100 Subject: [PATCH] fix processing of "auto" in walk_directory --- spacy/cli/_util.py | 4 ++++ spacy/cli/convert.py | 12 ++++++------ spacy/tests/test_cli.py | 28 ++++++++++++++++++++++++++-- 3 files changed, 36 insertions(+), 8 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index c46abffe5..f380e549f 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -29,6 +29,8 @@ if TYPE_CHECKING: SDIST_SUFFIX = ".tar.gz" WHEEL_SUFFIX = "-py3-none-any.whl" +AUTO = "auto" + PROJECT_FILE = "project.yml" PROJECT_LOCK = "project.lock" COMMAND = "python -m spacy" @@ -596,6 +598,8 @@ def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]: continue elif path.is_dir(): paths.extend(path.iterdir()) + elif suffix == AUTO: + locs.append(path) elif suffix is not None and not path.parts[-1].endswith(suffix): continue else: diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 7f365ae2c..c80ba38c1 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -7,7 +7,7 @@ import re import sys import itertools -from ._util import app, Arg, Opt, walk_directory +from ._util import app, Arg, Opt, walk_directory, AUTO from ..training import docs_to_json from ..tokens import Doc, DocBin from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs @@ -49,7 +49,7 @@ def convert_cli( model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"), morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"), merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"), - converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"), + converter: str = Opt(AUTO, "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"), ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True), lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"), concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"), @@ -100,7 +100,7 @@ def convert( model: Optional[str] = None, morphology: bool = False, merge_subtokens: bool = False, - converter: str = "auto", + converter: str = AUTO, ner_map: Optional[Path] = None, lang: Optional[str] = None, concatenate: bool = False, @@ -213,17 +213,17 @@ def verify_cli_args( if len(input_locs) == 0: msg.fail("No input files in directory", input_path, exits=1) file_types = list(set([loc.suffix[1:] for loc in input_locs])) - if converter == "auto" and len(file_types) >= 2: + if converter == AUTO and len(file_types) >= 2: file_types_str = ",".join(file_types) msg.fail("All input files must be same type", file_types_str, exits=1) - if converter != "auto" and converter not in CONVERTERS: + if converter != AUTO and converter not in CONVERTERS: msg.fail(f"Can't find converter for {converter}", exits=1) def _get_converter(msg, converter, input_path: Path): if input_path.is_dir(): input_path = walk_directory(input_path, converter)[0] - if converter == "auto": + if converter == AUTO: converter = input_path.suffix[1:] if converter == "ner" or converter == "iob": with input_path.open(encoding="utf8") as file_: diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index c6768a3fd..b863c6c8f 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -4,6 +4,7 @@ from collections import Counter from typing import Tuple, List, Dict, Any import pkg_resources import time +from pathlib import Path import spacy import numpy @@ -15,11 +16,11 @@ from thinc.api import Config, ConfigValidationError from spacy import about from spacy.cli import info -from spacy.cli._util import is_subpath_of, load_project_config +from spacy.cli._util import is_subpath_of, load_project_config, walk_directory from spacy.cli._util import parse_config_overrides, string_to_list from spacy.cli._util import substitute_project_variables from spacy.cli._util import validate_project_commands -from spacy.cli._util import upload_file, download_file +from spacy.cli._util import upload_file, download_file, AUTO from spacy.cli.debug_data import _compile_gold, _get_labels_from_model from spacy.cli.debug_data import _get_labels_from_spancat from spacy.cli.debug_data import _get_distribution, _get_kl_divergence @@ -1185,3 +1186,26 @@ def test_upload_download_local_file(): download_file(remote_file, local_file) with local_file.open(mode="r") as file_: assert file_.read() == content + + +def test_walk_directory(): + with make_tempdir() as d: + files = [ + "data1.iob", + "data2.iob", + "data3.json", + "data4.conll", + "data5.conll", + "data6.conll", + "data7.txt", + ] + + for f in files: + Path(d / f).touch() + + assert (len(walk_directory(d))) == 7 + assert (len(walk_directory(d, suffix="json"))) == 1 + assert (len(walk_directory(d, suffix="iob"))) == 2 + assert (len(walk_directory(d, suffix="conll"))) == 3 + assert (len(walk_directory(d, suffix="pdf"))) == 0 + assert (len(walk_directory(d, suffix=AUTO))) == 7