diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py
index 5e0cdfdf2..61438d1a8 100644
--- a/spacy/cli/project/assets.py
+++ b/spacy/cli/project/assets.py
@@ -12,6 +12,9 @@ from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
from .._util import get_checksum, download_file, git_checkout, get_git_version
from .._util import SimpleFrozenDict, parse_config_overrides
+# Default value of an asset's `extra` flag when its config entry omits the key.
+EXTRA_DEFAULT = False
+
@project_cli.command(
"assets",
@@ -21,7 +24,8 @@ def project_assets_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
- sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+.")
+ sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+."),
+ extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.")
# fmt: on
):
"""Fetch project assets like datasets and pretrained weights. Assets are
@@ -32,7 +36,12 @@ def project_assets_cli(
DOCS: https://spacy.io/api/cli#project-assets
"""
overrides = parse_config_overrides(ctx.args)
- project_assets(project_dir, overrides=overrides, sparse_checkout=sparse_checkout)
+ project_assets(
+ project_dir,
+ overrides=overrides,
+ sparse_checkout=sparse_checkout,
+ extra=extra,
+ )
def project_assets(
@@ -40,17 +49,29 @@ def project_assets(
*,
overrides: Dict[str, Any] = SimpleFrozenDict(),
sparse_checkout: bool = False,
+ extra: bool = False,
) -> None:
"""Fetch assets for a project using DVC if possible.
project_dir (Path): Path to project directory.
+ sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files
+ needed.
+ extra (bool): Whether to download all assets, including those marked as 'extra'.
"""
project_path = ensure_path(project_dir)
config = load_project_config(project_path, overrides=overrides)
- assets = config.get("assets", {})
+ assets = [
+ asset
+ for asset in config.get("assets", [])
+ if extra or not asset.get("extra", EXTRA_DEFAULT)
+ ]
if not assets:
- msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
+ msg.warn(
+ f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)",
+ exits=0,
+ )
msg.info(f"Fetching {len(assets)} asset(s)")
+
for asset in assets:
dest = (project_dir / asset["dest"]).resolve()
checksum = asset.get("checksum")
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 0fa6f5670..3ef56d9f6 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -341,6 +341,7 @@ def test_project_config_validation_full():
"assets": [
{
"dest": "x",
+ "extra": True,
"url": "https://example.com",
"checksum": "63373dd656daa1fd3043ce166a59474c",
},
@@ -352,6 +353,12 @@ def test_project_config_validation_full():
"path": "y",
},
},
+ {
+ "dest": "z",
+ "extra": False,
+ "url": "https://example.com",
+ "checksum": "63373dd656daa1fd3043ce166a59474c",
+ },
],
"commands": [
{
diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index 57d226913..fb6f05611 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -94,9 +94,8 @@ also use any private repo you have access to with Git.
Assets are data files your project needs – for example, the training and
evaluation data or pretrained vectors and embeddings to initialize your model
with. Each project template comes with a `project.yml` that defines the assets
-to download and where to put them. The
-[`spacy project assets`](/api/cli#project-assets) will fetch the project assets
-for you:
+to download and where to put them. The [`spacy project assets`](/api/cli#project-assets)
+command will fetch the project assets for you:
```cli
$ cd some_example_project
@@ -108,6 +107,11 @@ even cloud storage such as GCS and S3. You can also fetch assets using git, by
replacing the `url` string with a `git` block. spaCy will use Git's "sparse
checkout" feature to avoid downloading the whole repository.
+Sometimes your project configuration may include large assets that you don't
+necessarily want to download when you run `spacy project assets`. That's why
+assets can be marked as [`extra`](#data-assets-url) - by default, these assets
+are not downloaded. If they should be, run `spacy project assets --extra`.
+
### 3. Run a command {#run}
> #### project.yml
@@ -215,9 +219,9 @@ pipelines.
> #### Tip: Multi-line YAML syntax for long values
>
-> YAML has [multi-line syntax](https://yaml-multiline.info/) that can be
-> helpful for readability with longer values such as project descriptions or
-> commands that take several arguments.
+> YAML has [multi-line syntax](https://yaml-multiline.info/) that can be helpful
+> for readability with longer values such as project descriptions or commands
+> that take several arguments.
```yaml
%%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml
@@ -261,8 +265,9 @@ dependencies to use certain protocols.
> - dest: 'assets/training.spacy'
> url: 'https://example.com/data.spacy'
> checksum: '63373dd656daa1fd3043ce166a59474c'
-> # Download from Google Cloud Storage bucket
+> # Optional download from Google Cloud Storage bucket
> - dest: 'assets/development.spacy'
+> extra: True
> url: 'gs://your-bucket/corpora'
> checksum: '5113dc04e03f079525edd8df3f4f39e3'
> ```
@@ -270,6 +275,7 @@ dependencies to use certain protocols.
| Name | Description |
| ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
+| `extra` | Optional flag determining whether this asset is downloaded only if `spacy project assets` is run with `--extra`. `False` by default. |
| `url` | The URL to download from, using the respective protocol. |
| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
| `description` | Optional asset description, used in [auto-generated docs](#custom-docs). |
@@ -294,12 +300,12 @@ files you need and not the whole repo.
> description: 'The training data (5000 examples)'
> ```
-| Name | Description |
-| ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
+| Name | Description |
+| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
| `git` | `repo`: The URL of the repo to download from.<br />`path`: Path of the file or directory to download, relative to the repo root. "" specifies the root directory.<br />`branch`: The branch to download from. Defaults to `"master"`. |
-| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
-| `description` | Optional asset description, used in [auto-generated docs](#custom-docs). |
+| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
+| `description` | Optional asset description, used in [auto-generated docs](#custom-docs). |
#### Working with private assets {#data-asets-private}