walk directories

This commit is contained in:
kadarakos 2022-09-07 11:41:22 +00:00
parent e84b295279
commit fd7e299967
3 changed files with 91 additions and 78 deletions

View File

@@ -404,7 +404,10 @@ def git_checkout(
             if not is_subpath_of(tmp_dir, source_path):
                 err = f"'{subpath}' is a path outside of the cloned repository."
                 msg.fail(err, repo, exits=1)
-            shutil.copytree(str(source_path), str(dest))
+            if source_path.is_dir():
+                shutil.copytree(str(source_path), str(dest))
+            else:
+                shutil.copy(str(source_path), str(dest))
         except FileNotFoundError:
             err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')"
             msg.fail(err, repo, exits=1)
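
The hunk above lets git_checkout copy a single-file subpath as well as a directory: shutil.copytree raises NotADirectoryError when its source is a plain file, so the copy call has to branch on the path type. A minimal standalone sketch of that pattern, with a hypothetical helper name and paths that are not part of the commit:

import shutil
from pathlib import Path

def copy_dir_or_file(source_path: Path, dest: Path) -> None:
    # copytree only accepts a directory source; fall back to copy for files
    if source_path.is_dir():
        shutil.copytree(str(source_path), str(dest))
    else:
        shutil.copy(str(source_path), str(dest))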
@@ -573,3 +576,26 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
         local_msg.info("Using CPU")
         if gpu_is_available():
             local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")
+
+
+def walk_directory(path: Path, suffix: str) -> List[Path]:
+    if not path.is_dir():
+        return [path]
+    paths = [path]
+    locs = []
+    seen = set()
+    for path in paths:
+        if str(path) in seen:
+            continue
+        seen.add(str(path))
+        if path.parts[-1].startswith("."):
+            continue
+        elif path.is_dir():
+            paths.extend(path.iterdir())
+        elif not path.parts[-1].endswith(suffix):
+            continue
+        else:
+            locs.append(path)
+    # It's good to sort these, in case the ordering messes up cache.
+    locs.sort()
+    return locs
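
walk_directory, now shared from _util, recursively collects every file under a directory whose name ends with the given suffix, skipping entries whose name starts with a dot, and returns the matches sorted; a non-directory argument is returned as a one-element list unchanged. A small usage sketch, assuming the function above is in scope and using an invented directory layout:

from pathlib import Path

# Hypothetical layout:
#   corpus/a.txt
#   corpus/nested/b.txt
#   corpus/.hidden/c.txt   (skipped: hidden directory)
#   corpus/notes.md        (skipped: does not end with the suffix)
paths = walk_directory(Path("corpus"), ".txt")
# -> [Path("corpus/a.txt"), Path("corpus/nested/b.txt")]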

View File

@@ -1,14 +1,15 @@
 import tqdm
-import sys
-from ._util import app, Arg, Opt, setup_gpu, import_code
-from typing import Optional, Generator, Union
+from itertools import chain
 from pathlib import Path
+from typing import Optional, Generator, Union
 from wasabi import msg
+from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
 from ..tokens import Doc, DocBin
 from ..vocab import Vocab
-from .. import util
+from ..util import ensure_path, load_model

 path_help = ("Location of the documents to predict on."
@@ -23,52 +24,46 @@ code_help = ("Path to Python file with additional "
 gold_help = "Use gold preprocessing provided in the .spacy files"


-def _stream_data(
-    data_path: Path,
-    vocab: Vocab,
-    suffix: Optional[str] = None
-) -> Generator[Union[str, Doc], None, None]:
+def _stream_file(path: Path, vocab: Vocab) -> Generator[Union[Doc, str], None, None]:
     """
-    Load data which is either in a single file
-    in .spacy or plain text format or multiple
-    text files in a directory. If a directory
-    is provided skip subdirectories and undecodeable
-    files.
+    Stream data from a single file. If the path points to
+    a .spacy file then yield from the DocBin otherwise
+    yield each line of a text file. If a decoding error
+    is encountered during reading the file exit.
     """
-    if not data_path.is_dir():
+    if not path.is_dir():
         # Yield from DocBin.
-        if data_path.suffix == ".spacy":
-            docbin = DocBin().from_disk(data_path)
+        if path.suffix == ".spacy":
+            docbin = DocBin().from_disk(path)
             for doc in docbin.get_docs(vocab):
                 yield doc
         # Yield from text file
         else:
             try:
-                with open(data_path, 'r') as fin:
+                with open(path, 'r') as fin:
                     for line in fin:
                         yield line
             except UnicodeDecodeError as e:
                 print(e)
                 msg.warn(
-                    f"{data_path} could not be decoded.",
+                    f"{path} could not be decoded.",
                     exits=True
                 )
-    else:
-        # Yield per one file in directory
-        for path in data_path.iterdir():
-            if path.is_dir():
-                msg.warn(f"Skipping directory {path}")
-            elif suffix is not None and path.suffix != suffix:
-                print(suffix, path.suffix)
-                msg.warn(f"Skipping file {path}")
-            else:
-                with open(path, 'r') as fin:
-                    try:
-                        text = fin.read()
-                        yield text
-                    except UnicodeDecodeError as e:
-                        msg.warn(f"Skipping file {path}")
-                        print(e)
+
+
+def _maybe_read(path: Path) -> Union[str, None]:
+    """
+    Try to read the text file from the provided path.
+    When encountering a decoding error just warn and pass.
+    """
+    with open(path, 'r') as fin:
+        try:
+            text = fin.read()
+            return text
+        except UnicodeDecodeError as e:
+            msg.warn(f"Skipping file {path}")
+            print(e)
+            return None


 @app.command("apply")
@@ -78,20 +73,27 @@ def apply_cli(
     data_path: Path = Arg(..., help=path_help, exists=True),
     output: Path = Arg(..., help=out_help, dir_okay=False),
     code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
-    use_gpu: Optional[int] = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
-    batch_size: Optional[int] = Opt(1, "--batch-size", "-b", help="Batch size."),
-    n_process: Optional[int] = Opt(1, "--n-process", "-n", help="number of processors to use."),
-    suffix: Optional[str] = Opt(None, "--suffix", "-n", help="Only read files with file.suffix.")
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
+    batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."),
+    n_process: int = Opt(1, "--n-process", "-n", help="number of processors to use."),
+    suffix: str = Opt("", "--suffix", "-n", help="Only read files with file.suffix.")
 ):
     """
     Apply a trained pipeline to documents to get predictions.
     Expects a loadable spaCy pipeline and some data as input.
-    The input can be provided multiple formats. It can be a .spacy
-    file, a single text file with one document per line or a directory
-    where each file is assumed to be plain text document.
+    The data can be provided in multiple formats. It can be a single
+    .spacy file or a single text file with one document per line.
+    A directory can also be provided in which case the 'suffix'
+    argument is required. All paths pointing to files with the
+    provided suffix will be recursively collected and processed.

     DOCS: https://spacy.io/api/cli#tba
     """
+    if data_path.is_dir() and suffix == "":
+        raise ValueError(
+            "When the provided 'data_path' is a directory "
+            "the --suffix argument has to be provided as well."
+        )
     if suffix is not None:
         if not suffix.startswith("."):
             suffix = "." + suffix
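
The new guard above makes --suffix mandatory whenever data_path is a directory, and a bare value such as txt is normalised to .txt before it reaches walk_directory. A tiny standalone sketch of that validation, with a hypothetical helper name used only for illustration:

from pathlib import Path

def normalize_suffix(data_path: Path, suffix: str) -> str:
    # Mirror of the checks in apply_cli: directories need an explicit suffix,
    # and a leading dot is added when it is missing.
    if data_path.is_dir() and suffix == "":
        raise ValueError("--suffix is required when data_path is a directory")
    if suffix and not suffix.startswith("."):
        suffix = "." + suffix
    return suffix

print(normalize_suffix(Path("docs.txt"), "txt"))  # prints ".txt"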
@@ -106,17 +108,29 @@ def apply(
     model: str,
     batch_size: int,
     n_process: int,
-    suffix: Optional[str]
+    suffix: str
 ):
-    data_path = util.ensure_path(data_path)
-    output_path = util.ensure_path(output)
+    data_path = ensure_path(data_path)
+    output_path = ensure_path(output)
     if not data_path.exists():
         msg.fail("Couldn't find data path.", data_path, exits=1)
-    nlp = util.load_model(model)
+    nlp = load_model(model)
     msg.good(f"Loaded model {model}")
     vocab = nlp.vocab
     docbin = DocBin()
-    datagen = _stream_data(data_path, vocab, suffix)
+    datagen: Union[
+        Generator[Union[Doc, str], None, None],
+        chain[Union[Doc, str]],
+        filter[str]
+    ]
+    if not data_path.is_dir():
+        datagen = _stream_file(data_path, vocab)
+    else:
+        paths = walk_directory(data_path, suffix)
+        if suffix == ".spacy":
+            datagen = chain(*[_stream_file(path, vocab) for path in paths])
+        else:
+            datagen = filter(None, (_maybe_read(path) for path in paths))
     for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
         docbin.add(doc)
     if output_path.is_dir():
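
datagen in the new apply body takes one of three shapes: a generator from _stream_file for a single input file, an itertools.chain over per-file generators when a directory of .spacy files is given, or a filter over _maybe_read results that drops files which could not be decoded. A minimal standalone sketch of the same dispatch idea, using placeholder reader functions rather than the helpers from this diff:

from itertools import chain
from pathlib import Path
from typing import Iterable, Optional

def read_one(path: Path) -> Iterable[str]:
    # stand-in for _stream_file: yield items from a single file
    yield from path.read_text(encoding="utf8").splitlines()

def maybe_read(path: Path) -> Optional[str]:
    # stand-in for _maybe_read: return None for unreadable files
    try:
        return path.read_text(encoding="utf8")
    except UnicodeDecodeError:
        return None

paths = [Path("a.txt"), Path("b.txt")]  # hypothetical inputs
# chain flattens many per-file generators into a single lazy stream ...
stream = chain.from_iterable(read_one(p) for p in paths)
# ... while filter(None, ...) silently drops files that returned None.
texts = filter(None, (maybe_read(p) for p in paths))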

View File

@@ -1,4 +1,4 @@
-from typing import Callable, Iterable, Mapping, Optional, Any, List, Union
+from typing import Callable, Iterable, Mapping, Optional, Any, Union
 from enum import Enum
 from pathlib import Path
 from wasabi import Printer
@@ -7,7 +7,7 @@ import re
 import sys
 import itertools
-from ._util import app, Arg, Opt
+from ._util import app, Arg, Opt, walk_directory
 from ..training import docs_to_json
 from ..tokens import Doc, DocBin
 from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
@@ -189,33 +189,6 @@ def autodetect_ner_format(input_data: str) -> Optional[str]:
     return None


-def walk_directory(path: Path, converter: str) -> List[Path]:
-    if not path.is_dir():
-        return [path]
-    paths = [path]
-    locs = []
-    seen = set()
-    for path in paths:
-        if str(path) in seen:
-            continue
-        seen.add(str(path))
-        if path.parts[-1].startswith("."):
-            continue
-        elif path.is_dir():
-            paths.extend(path.iterdir())
-        elif converter == "json" and not path.parts[-1].endswith("json"):
-            continue
-        elif converter == "conll" and not path.parts[-1].endswith("conll"):
-            continue
-        elif converter == "iob" and not path.parts[-1].endswith("iob"):
-            continue
-        else:
-            locs.append(path)
-    # It's good to sort these, in case the ordering messes up cache.
-    locs.sort()
-    return locs
-
-
 def verify_cli_args(
     msg: Printer,
     input_path: Path,
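
With the converter-specific walk_directory deleted here, the convert code presumably calls the suffix-based helper imported from ._util and passes the converter name as the suffix. A hedged sketch of that equivalence; the call site and paths are assumptions, not shown in this diff:

from pathlib import Path

converter = "json"                       # old code matched names ending in "json"
input_path = Path("corpus")              # hypothetical input directory
input_locs = walk_directory(input_path, converter)  # shared _util helper, same filtering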