diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py index 5e0cdfdf2..61438d1a8 100644 --- a/spacy/cli/project/assets.py +++ b/spacy/cli/project/assets.py @@ -12,6 +12,9 @@ from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config from .._util import get_checksum, download_file, git_checkout, get_git_version from .._util import SimpleFrozenDict, parse_config_overrides +# Whether assets are extra if `extra` is not set. +EXTRA_DEFAULT = False + @project_cli.command( "assets", @@ -21,7 +24,8 @@ def project_assets_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), - sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+.") + sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+."), + extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.") # fmt: on ): """Fetch project assets like datasets and pretrained weights. Assets are @@ -32,7 +36,12 @@ def project_assets_cli( DOCS: https://spacy.io/api/cli#project-assets """ overrides = parse_config_overrides(ctx.args) - project_assets(project_dir, overrides=overrides, sparse_checkout=sparse_checkout) + project_assets( + project_dir, + overrides=overrides, + sparse_checkout=sparse_checkout, + extra=extra, + ) def project_assets( @@ -40,17 +49,29 @@ def project_assets( *, overrides: Dict[str, Any] = SimpleFrozenDict(), sparse_checkout: bool = False, + extra: bool = False, ) -> None: """Fetch assets for a project using DVC if possible. project_dir (Path): Path to project directory. 
+ sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files + needed. + extra (bool): Whether to download all assets, including those marked as 'extra'. """ project_path = ensure_path(project_dir) config = load_project_config(project_path, overrides=overrides) - assets = config.get("assets", {}) + assets = [ + asset + for asset in config.get("assets", []) + if extra or not asset.get("extra", EXTRA_DEFAULT) + ] if not assets: - msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0) + msg.warn( + f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)", + exits=0, + ) msg.info(f"Fetching {len(assets)} asset(s)") + for asset in assets: dest = (project_dir / asset["dest"]).resolve() checksum = asset.get("checksum") diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 0fa6f5670..3ef56d9f6 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -341,6 +341,7 @@ def test_project_config_validation_full(): "assets": [ { "dest": "x", + "extra": True, "url": "https://example.com", "checksum": "63373dd656daa1fd3043ce166a59474c", }, @@ -352,6 +353,12 @@ def test_project_config_validation_full(): "path": "y", }, }, + { + "dest": "z", + "extra": False, + "url": "https://example.com", + "checksum": "63373dd656daa1fd3043ce166a59474c", + }, ], "commands": [ { diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 57d226913..fb6f05611 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -94,9 +94,8 @@ also use any private repo you have access to with Git. Assets are data files your project needs – for example, the training and evaluation data or pretrained vectors and embeddings to initialize your model with. Each project template comes with a `project.yml` that defines the assets -to download and where to put them. 
The -[`spacy project assets`](/api/cli#project-assets) will fetch the project assets -for you: +to download and where to put them. The [`spacy project assets`](/api/cli#project-assets) +command will fetch the project assets for you: ```cli $ cd some_example_project @@ -108,6 +107,11 @@ even cloud storage such as GCS and S3. You can also fetch assets using git, by replacing the `url` string with a `git` block. spaCy will use Git's "sparse checkout" feature to avoid downloading the whole repository. +Sometimes your project configuration may include large assets that you don't +necessarily want to download when you run `spacy project assets`. That's why +assets can be marked as [`extra`](#data-assets-url) - by default, these assets +are not downloaded. If they should be, run `spacy project assets --extra`. + ### 3. Run a command {#run} > #### project.yml @@ -215,9 +219,9 @@ pipelines. > #### Tip: Multi-line YAML syntax for long values > -> YAML has [multi-line syntax](https://yaml-multiline.info/) that can be -> helpful for readability with longer values such as project descriptions or -> commands that take several arguments. +> YAML has [multi-line syntax](https://yaml-multiline.info/) that can be helpful +> for readability with longer values such as project descriptions or commands +> that take several arguments. ```yaml %%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml @@ -261,8 +265,9 @@ dependencies to use certain protocols. > - dest: 'assets/training.spacy' > url: 'https://example.com/data.spacy' > checksum: '63373dd656daa1fd3043ce166a59474c' -> # Download from Google Cloud Storage bucket +> # Optional download from Google Cloud Storage bucket > - dest: 'assets/development.spacy' +> extra: True > url: 'gs://your-bucket/corpora' > checksum: '5113dc04e03f079525edd8df3f4f39e3' > ``` @@ -270,6 +275,7 @@ dependencies to use certain protocols. 
| Name | Description | | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. | +| `extra` | Optional flag determining whether this asset is downloaded only if `spacy project assets` is run with `--extra`. `False` by default. | | `url` | The URL to download from, using the respective protocol. | | `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. | | `description` | Optional asset description, used in [auto-generated docs](#custom-docs). | @@ -294,12 +300,12 @@ files you need and not the whole repo. > description: 'The training data (5000 examples)' > ``` -| Name | Description | -| ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. | +| Name | Description | +| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. | | `git` | `repo`: The URL of the repo to download from.
`path`: Path of the file or directory to download, relative to the repo root. "" specifies the root directory.
`branch`: The branch to download from. Defaults to `"master"`. | -| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. | -| `description` | Optional asset description, used in [auto-generated docs](#custom-docs). | +| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. | +| `description` | Optional asset description, used in [auto-generated docs](#custom-docs). | #### Working with private assets {#data-asets-private}