Update CLI args and docstrings

2025-09-15 16:42:36 +03:00 · 2020-07-09 19:44:28 +02:00 · 2020-07-09 19:44:28 +02:00 · 05e182e421
commit 05e182e421
parent ac4297ee39
4 changed files with 23 additions and 25 deletions
--- a/spacy/cli/project/clone.py
+++ b/spacy/cli/project/clone.py
@@ -1,3 +1,4 @@
+from typing import Optional
 from pathlib import Path
 from wasabi import msg
 import subprocess
@@ -24,22 +25,18 @@ DIRS = [
 @project_cli.command("clone")
 def project_clone_cli(
    # fmt: off
-    name: str = Arg(..., help="The name of the template to fetch"),
+    name: str = Arg(..., help="The name of the template to clone"),
-    dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
+    dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
-    repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
+    repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to clone from"),
    # fmt: on
 ):
    """Clone a project template from a repository. Calls into "git" and will
    only download the files from the given subdirectory. The GitHub repo
    defaults to the official spaCy template repo, but can be customized
-    (including using a private repo). Setting the --git flag will also
+    (including using a private repo).
-    initialize the project directory as a Git repo. If the project is intended
-    to be a Git repo, it should be initialized with Git first, before
-    initializing DVC (Data Version Control). This allows DVC to integrate with
-    Git.
    """
-    if dest == Path.cwd():
+    if dest is None:
-        dest = dest / name
+        dest = Path.cwd() / name
    project_clone(name, dest, repo=repo)
--- a/spacy/cli/project/dvc.py
+++ b/spacy/cli/project/dvc.py
@@ -30,7 +30,7 @@ def project_update_dvc_cli(
    """Auto-generate Data Version Control (DVC) config. A DVC
    project can only define one pipeline, so you need to specify one workflow
    defined in the project.yml. If no workflow is specified, the first defined
-    workflow is used. The DVC config will only be updated if
+    workflow is used. The DVC config will only be updated if the project.yml changed.
    """
    project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@@ -20,14 +20,14 @@ def project_run_cli(
    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
-    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute commands"),
+    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
 ):
-    """Run a named script or workflow defined in the project.yml. If a workflow
+    """Run a named command or workflow defined in the project.yml. If a workflow
    name is specified, all commands in the workflow are run, in order. If
-    commands define inputs and/or outputs, they will only be re-run if state
+    commands define dependencies and/or outputs, they will only be re-run if
-    has changed.
+    state has changed.
    """
    if show_help or not subcommand:
        print_run_help(project_dir, subcommand)
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -121,14 +121,14 @@ class ConfigSchema(BaseModel):
 @app.command("train")
 def train_cli(
    # fmt: off
-    train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
+    train_path: Path = Arg(..., help="Location of training data", exists=True),
-    dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
+    dev_path: Path = Arg(..., help="Location of development data", exists=True),
    config_path: Path = Arg(..., help="Path to config file", exists=True),
    output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental."),
    raw_text: Optional[Path] = Opt(None, "--raw-text", "-rt", help="Path to jsonl file with unlabelled text documents."),
-    verbose: bool = Opt(False, "--verbose", "-VV", help="Display more information for debugging purposes"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
    use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
    tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map"),
    omit_extra_lookups: bool = Opt(False, "--omit-extra-lookups", "-OEL", help="Don't include extra lookups in model"),
@@ -203,8 +203,10 @@ def train(
        msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
        train_examples = list(
            corpus.train_dataset(
-                nlp, shuffle=False, gold_preproc=training["gold_preproc"],
+                nlp,
-                max_length=training["max_length"]
+                shuffle=False,
+                gold_preproc=training["gold_preproc"],
+                max_length=training["max_length"],
            )
        )
        nlp.begin_training(lambda: train_examples)
@@ -322,10 +324,7 @@ def create_train_batches(nlp, corpus, cfg):
                discard_oversize=cfg["discard_oversize"],
            )
        else:
-            batches = util.minibatch(
+            batches = util.minibatch(train_examples, size=cfg["batch_size"])
-                train_examples,
-                size=cfg["batch_size"],
-            )
        # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
        try:
@@ -438,7 +437,9 @@ def train_while_improving(
    if raw_text:
        random.shuffle(raw_text)
-        raw_examples = [Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text]
+        raw_examples = [
+            Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text
+        ]
        raw_batches = util.minibatch(raw_examples, size=8)
    for step, (epoch, batch) in enumerate(train_data):