Update CLI args and docstrings

This commit is contained in:
Ines Montani 2020-07-09 19:44:28 +02:00
parent ac4297ee39
commit 05e182e421
4 changed files with 23 additions and 25 deletions

View File

@ -1,3 +1,4 @@
from typing import Optional
from pathlib import Path from pathlib import Path
from wasabi import msg from wasabi import msg
import subprocess import subprocess
@ -24,22 +25,18 @@ DIRS = [
@project_cli.command("clone") @project_cli.command("clone")
def project_clone_cli( def project_clone_cli(
# fmt: off # fmt: off
name: str = Arg(..., help="The name of the template to fetch"), name: str = Arg(..., help="The name of the template to clone"),
dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False), dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."), repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to clone from"),
# fmt: on # fmt: on
): ):
"""Clone a project template from a repository. Calls into "git" and will """Clone a project template from a repository. Calls into "git" and will
only download the files from the given subdirectory. The GitHub repo only download the files from the given subdirectory. The GitHub repo
defaults to the official spaCy template repo, but can be customized defaults to the official spaCy template repo, but can be customized
(including using a private repo). Setting the --git flag will also (including using a private repo).
initialize the project directory as a Git repo. If the project is intended
to be a Git repo, it should be initialized with Git first, before
initializing DVC (Data Version Control). This allows DVC to integrate with
Git.
""" """
if dest == Path.cwd(): if dest is None:
dest = dest / name dest = Path.cwd() / name
project_clone(name, dest, repo=repo) project_clone(name, dest, repo=repo)

View File

@ -30,7 +30,7 @@ def project_update_dvc_cli(
"""Auto-generate Data Version Control (DVC) config. A DVC """Auto-generate Data Version Control (DVC) config. A DVC
project can only define one pipeline, so you need to specify one workflow project can only define one pipeline, so you need to specify one workflow
defined in the project.yml. If no workflow is specified, the first defined defined in the project.yml. If no workflow is specified, the first defined
workflow is used. The DVC config will only be updated if workflow is used. The DVC config will only be updated if the project.yml changed.
""" """
project_update_dvc(project_dir, workflow, verbose=verbose, force=force) project_update_dvc(project_dir, workflow, verbose=verbose, force=force)

View File

@ -20,14 +20,14 @@ def project_run_cli(
subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"), subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"), force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute commands"), dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
# fmt: on # fmt: on
): ):
"""Run a named script or workflow defined in the project.yml. If a workflow """Run a named command or workflow defined in the project.yml. If a workflow
name is specified, all commands in the workflow are run, in order. If name is specified, all commands in the workflow are run, in order. If
commands define inputs and/or outputs, they will only be re-run if state commands define dependencies and/or outputs, they will only be re-run if
has changed. state has changed.
""" """
if show_help or not subcommand: if show_help or not subcommand:
print_run_help(project_dir, subcommand) print_run_help(project_dir, subcommand)

View File

@ -121,14 +121,14 @@ class ConfigSchema(BaseModel):
@app.command("train") @app.command("train")
def train_cli( def train_cli(
# fmt: off # fmt: off
train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True), train_path: Path = Arg(..., help="Location of training data", exists=True),
dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True), dev_path: Path = Arg(..., help="Location of development data", exists=True),
config_path: Path = Arg(..., help="Path to config file", exists=True), config_path: Path = Arg(..., help="Path to config file", exists=True),
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"), output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental."), init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental."),
raw_text: Optional[Path] = Opt(None, "--raw-text", "-rt", help="Path to jsonl file with unlabelled text documents."), raw_text: Optional[Path] = Opt(None, "--raw-text", "-rt", help="Path to jsonl file with unlabelled text documents."),
verbose: bool = Opt(False, "--verbose", "-VV", help="Display more information for debugging purposes"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"), use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map"), tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map"),
omit_extra_lookups: bool = Opt(False, "--omit-extra-lookups", "-OEL", help="Don't include extra lookups in model"), omit_extra_lookups: bool = Opt(False, "--omit-extra-lookups", "-OEL", help="Don't include extra lookups in model"),
@ -203,8 +203,10 @@ def train(
msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}") msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
train_examples = list( train_examples = list(
corpus.train_dataset( corpus.train_dataset(
nlp, shuffle=False, gold_preproc=training["gold_preproc"], nlp,
max_length=training["max_length"] shuffle=False,
gold_preproc=training["gold_preproc"],
max_length=training["max_length"],
) )
) )
nlp.begin_training(lambda: train_examples) nlp.begin_training(lambda: train_examples)
@ -322,10 +324,7 @@ def create_train_batches(nlp, corpus, cfg):
discard_oversize=cfg["discard_oversize"], discard_oversize=cfg["discard_oversize"],
) )
else: else:
batches = util.minibatch( batches = util.minibatch(train_examples, size=cfg["batch_size"])
train_examples,
size=cfg["batch_size"],
)
# make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
try: try:
@ -438,7 +437,9 @@ def train_while_improving(
if raw_text: if raw_text:
random.shuffle(raw_text) random.shuffle(raw_text)
raw_examples = [Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text] raw_examples = [
Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text
]
raw_batches = util.minibatch(raw_examples, size=8) raw_batches = util.minibatch(raw_examples, size=8)
for step, (epoch, batch) in enumerate(train_data): for step, (epoch, batch) in enumerate(train_data):