Update CLI args and docstrings

This commit is contained in:
Ines Montani 2020-07-09 19:44:28 +02:00
parent ac4297ee39
commit 05e182e421
4 changed files with 23 additions and 25 deletions

View File

@@ -1,3 +1,4 @@
+from typing import Optional
from pathlib import Path
from wasabi import msg
import subprocess
@@ -24,22 +25,18 @@ DIRS = [
@project_cli.command("clone")
def project_clone_cli(
# fmt: off
-name: str = Arg(..., help="The name of the template to fetch"),
-dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
-repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
+name: str = Arg(..., help="The name of the template to clone"),
+dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
+repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to clone from"),
# fmt: on
):
"""Clone a project template from a repository. Calls into "git" and will
only download the files from the given subdirectory. The GitHub repo
defaults to the official spaCy template repo, but can be customized
-(including using a private repo). Setting the --git flag will also
-initialize the project directory as a Git repo. If the project is intended
-to be a Git repo, it should be initialized with Git first, before
-initializing DVC (Data Version Control). This allows DVC to integrate with
-Git.
+(including using a private repo).
"""
-if dest == Path.cwd():
-    dest = dest / name
+if dest is None:
+    dest = Path.cwd() / name
project_clone(name, dest, repo=repo)
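
Note on the dest change in this hunk: switching the default from Path.cwd() to None is the usual Optional-sentinel pattern, so an omitted argument can be told apart from an explicitly passed path. A minimal sketch of the pattern in isolation (resolve_dest is a hypothetical helper, not part of the commit):

    from pathlib import Path
    from typing import Optional

    def resolve_dest(name: str, dest: Optional[Path] = None) -> Path:
        # Only fall back to <cwd>/<template name> when no destination was given
        if dest is None:
            dest = Path.cwd() / name
        return dest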

View File

@@ -30,7 +30,7 @@ def project_update_dvc_cli(
"""Auto-generate Data Version Control (DVC) config. A DVC
project can only define one pipeline, so you need to specify one workflow
defined in the project.yml. If no workflow is specified, the first defined
-workflow is used. The DVC config will only be updated if
+workflow is used. The DVC config will only be updated if the project.yml changed.
"""
project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
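
The updated docstring promises that the DVC config is only rewritten when project.yml changed. The diff does not show how that check is implemented; a plausible sketch, assuming a content-hash comparison (project_yml_changed is a hypothetical helper):

    import hashlib
    from pathlib import Path

    def project_yml_changed(project_dir: Path, stored_hash: str) -> bool:
        # Hash the current project.yml and compare with the hash recorded
        # at the last DVC config update
        data = (project_dir / "project.yml").read_bytes()
        return hashlib.md5(data).hexdigest() != stored_hash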

View File

@@ -20,14 +20,14 @@ def project_run_cli(
subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
-dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute commands"),
+dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
# fmt: on
):
"""Run a named script or workflow defined in the project.yml. If a workflow
"""Run a named command or workflow defined in the project.yml. If a workflow
name is specified, all commands in the workflow are run, in order. If
-commands define inputs and/or outputs, they will only be re-run if state
-has changed.
+commands define dependencies and/or outputs, they will only be re-run if
+state has changed.
"""
if show_help or not subcommand:
print_run_help(project_dir, subcommand)
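
On "re-run only if state has changed": the usual way to implement this is to fingerprint each command's declared dependencies and outputs and skip the command when the fingerprints match the last recorded run. A sketch under that assumption (files_hash and needs_rerun are hypothetical helpers; the actual mechanism is not part of this diff):

    import hashlib
    from pathlib import Path
    from typing import Dict, List

    def files_hash(paths: List[Path]) -> str:
        # Stable fingerprint over a set of files (missing files hash as absent)
        digest = hashlib.md5()
        for path in sorted(paths):
            if path.exists():
                digest.update(path.read_bytes())
        return digest.hexdigest()

    def needs_rerun(deps: List[Path], outputs: List[Path], last: Dict[str, str]) -> bool:
        current = {"deps": files_hash(deps), "outputs": files_hash(outputs)}
        return current != last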

View File

@@ -121,14 +121,14 @@ class ConfigSchema(BaseModel):
@app.command("train")
def train_cli(
# fmt: off
-train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
-dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
+train_path: Path = Arg(..., help="Location of training data", exists=True),
+dev_path: Path = Arg(..., help="Location of development data", exists=True),
config_path: Path = Arg(..., help="Path to config file", exists=True),
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental."),
raw_text: Optional[Path] = Opt(None, "--raw-text", "-rt", help="Path to jsonl file with unlabelled text documents."),
-verbose: bool = Opt(False, "--verbose", "-VV", help="Display more information for debugging purposes"),
+verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map"),
omit_extra_lookups: bool = Opt(False, "--omit-extra-lookups", "-OEL", help="Don't include extra lookups in model"),
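
Arg and Opt in these signatures are presumably thin aliases for typer.Argument and typer.Option (the aliases are defined outside this diff, so this is an assumption). A minimal sketch of how such a declarative signature maps to CLI flags:

    import typer

    Arg = typer.Argument  # assumed aliases; not shown in this diff
    Opt = typer.Option
    app = typer.Typer()

    @app.command("train")
    def train_cli(
        train_path: str = Arg(..., help="Location of training data"),
        verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Show more info"),
    ):
        typer.echo(f"training from {train_path} (verbose={verbose})")

    if __name__ == "__main__":
        app()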
@@ -203,8 +203,10 @@ def train(
msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
train_examples = list(
corpus.train_dataset(
-nlp, shuffle=False, gold_preproc=training["gold_preproc"],
-max_length=training["max_length"]
+nlp,
+shuffle=False,
+gold_preproc=training["gold_preproc"],
+max_length=training["max_length"],
)
)
nlp.begin_training(lambda: train_examples)
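
Aside from the black-style reformat above, the surrounding lines show a pattern worth noting: the corpus is materialized with list(...) once, and begin_training receives a zero-argument callable rather than the examples themselves, so it can fetch them on demand. A stripped-down illustration (stand-in objects, not spaCy's API):

    def begin_training(get_examples):
        # Receives a callable so the examples can be pulled when needed,
        # possibly more than once
        examples = get_examples()
        return len(examples)

    train_examples = list(range(3))         # stand-in for Example objects
    begin_training(lambda: train_examples)  # -> 3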
@@ -322,10 +324,7 @@ def create_train_batches(nlp, corpus, cfg):
discard_oversize=cfg["discard_oversize"],
)
else:
-batches = util.minibatch(
-    train_examples,
-    size=cfg["batch_size"],
-)
+batches = util.minibatch(train_examples, size=cfg["batch_size"])
# make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
try:
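
The comment about an infinite training loop refers to guarding against an empty batch stream. A sketch of fixed-size minibatching and that guard, assuming util.minibatch behaves like the generator below (the real helper also accepts variable size schedules):

    from itertools import islice

    def minibatch(items, size):
        # Yield successive fixed-size lists; stop when the input is exhausted
        it = iter(items)
        while True:
            batch = list(islice(it, size))
            if not batch:
                return
            yield batch

    batches = minibatch(range(10), size=4)
    try:
        first_batch = next(batches)  # raises StopIteration if there is nothing to train on
    except StopIteration:
        raise ValueError("Empty batch stream: training would loop forever")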
@@ -438,7 +437,9 @@ def train_while_improving(
if raw_text:
random.shuffle(raw_text)
-raw_examples = [Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text]
+raw_examples = [
+    Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text
+]
raw_batches = util.minibatch(raw_examples, size=8)
for step, (epoch, batch) in enumerate(train_data):
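
The raw-text lines above build unlabelled examples: Example.from_dict(doc, {}) wraps a Doc with an empty gold-annotation dict, which the training loop can then mix in as raw batches. A self-contained version, assuming spaCy v3's API (the import path is a guess; it may have differed at the time of this commit):

    import spacy
    from spacy.training import Example  # modern import path; may have lived elsewhere here

    nlp = spacy.blank("en")
    raw_text = [{"text": "Some unlabelled text."}, {"text": "More raw text."}]
    # {} as the annotation dict means: no gold labels, just the words
    raw_examples = [
        Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text
    ]
    print(len(raw_examples))  # -> 2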