diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py
index ee1fd790c..ca85bfb22 100644
--- a/spacy/cli/project/clone.py
+++ b/spacy/cli/project/clone.py
@@ -1,3 +1,4 @@
+from typing import Optional
 from pathlib import Path
 from wasabi import msg
 import subprocess
@@ -24,22 +25,18 @@ DIRS = [

 @project_cli.command("clone")
 def project_clone_cli(
     # fmt: off
-    name: str = Arg(..., help="The name of the template to fetch"),
-    dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
-    repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
+    name: str = Arg(..., help="The name of the template to clone"),
+    dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
+    repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to clone from"),
     # fmt: on
 ):
     """Clone a project template from a repository. Calls into "git" and will
     only download the files from the given subdirectory. The GitHub repo
     defaults to the official spaCy template repo, but can be customized
-    (including using a private repo). Setting the --git flag will also
-    initialize the project directory as a Git repo. If the project is intended
-    to be a Git repo, it should be initialized with Git first, before
-    initializing DVC (Data Version Control). This allows DVC to integrate with
-    Git.
+    (including using a private repo).
     """
-    if dest == Path.cwd():
-        dest = dest / name
+    if dest is None:
+        dest = Path.cwd() / name
     project_clone(name, dest, repo=repo)

diff --git a/spacy/cli/project/dvc.py b/spacy/cli/project/dvc.py
index a98cb939a..dce97179e 100644
--- a/spacy/cli/project/dvc.py
+++ b/spacy/cli/project/dvc.py
@@ -30,7 +30,7 @@ def project_update_dvc_cli(
     """Auto-generate Data Version Control (DVC) config. A DVC project can
     only define one pipeline, so you need to specify one workflow defined
     in the project.yml. If no workflow is specified, the first defined
-    workflow is used. The DVC config will only be updated if
+    workflow is used. The DVC config will only be updated if the project.yml changed.
     """
     project_update_dvc(project_dir, workflow, verbose=verbose, force=force)

diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py
index a4d7dd644..db7633ade 100644
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@@ -20,14 +20,14 @@ def project_run_cli(
     subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
     project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
     force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
-    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute commands"),
+    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
     show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
     # fmt: on
 ):
-    """Run a named script or workflow defined in the project.yml. If a workflow
+    """Run a named command or workflow defined in the project.yml. If a workflow
     name is specified, all commands in the workflow are run, in order. If
-    commands define inputs and/or outputs, they will only be re-run if state
-    has changed.
+    commands define dependencies and/or outputs, they will only be re-run if
+    state has changed.
     """
     if show_help or not subcommand:
         print_run_help(project_dir, subcommand)
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 2f1556beb..6cf4d79c8 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -121,14 +121,14 @@ class ConfigSchema(BaseModel):
 @app.command("train")
 def train_cli(
     # fmt: off
-    train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
-    dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
+    train_path: Path = Arg(..., help="Location of training data", exists=True),
+    dev_path: Path = Arg(..., help="Location of development data", exists=True),
     config_path: Path = Arg(..., help="Path to config file", exists=True),
     output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store model in"),
     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     init_tok2vec: Optional[Path] = Opt(None, "--init-tok2vec", "-t2v", help="Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental."),
     raw_text: Optional[Path] = Opt(None, "--raw-text", "-rt", help="Path to jsonl file with unlabelled text documents."),
-    verbose: bool = Opt(False, "--verbose", "-VV", help="Display more information for debugging purposes"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
     tag_map_path: Optional[Path] = Opt(None, "--tag-map-path", "-tm", help="Location of JSON-formatted tag map"),
     omit_extra_lookups: bool = Opt(False, "--omit-extra-lookups", "-OEL", help="Don't include extra lookups in model"),
@@ -203,8 +203,10 @@ def train(
     msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
     train_examples = list(
         corpus.train_dataset(
-            nlp, shuffle=False, gold_preproc=training["gold_preproc"],
-            max_length=training["max_length"]
+            nlp,
+            shuffle=False,
+            gold_preproc=training["gold_preproc"],
+            max_length=training["max_length"],
         )
     )
     nlp.begin_training(lambda: train_examples)
@@ -322,10 +324,7 @@ def create_train_batches(nlp, corpus, cfg):
             discard_oversize=cfg["discard_oversize"],
         )
     else:
-        batches = util.minibatch(
-            train_examples,
-            size=cfg["batch_size"],
-        )
+        batches = util.minibatch(train_examples, size=cfg["batch_size"])

     # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
     try:
@@ -438,7 +437,9 @@ def train_while_improving(

     if raw_text:
         random.shuffle(raw_text)
-        raw_examples = [Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text]
+        raw_examples = [
+            Example.from_dict(nlp.make_doc(rt["text"]), {}) for rt in raw_text
+        ]
         raw_batches = util.minibatch(raw_examples, size=8)

     for step, (epoch, batch) in enumerate(train_data):