mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
Allow assets to be optional in spacy project (#10714)
* Allow assets to be optional in spacy project: draft for optional flag/download_all options. * Allow assets to be optional in spacy project: added OPTIONAL_DEFAULT reflecting default asset optionality. * Allow assets to be optional in spacy project: renamed --all to --extra. * Allow assets to be optional in spacy project: included optional flag in project config test. * Allow assets to be optional in spacy project: added documentation. * Allow assets to be optional in spacy project: fixing deprecated --all reference. Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * Allow assets to be optional in spacy project: fixed project_assets() docstring. * Allow assets to be optional in spacy project: adjusted wording in justification of optional assets. Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Allow assets to be optional in spacy project: switched to as keyword in project.yml. Updated docs. * Allow assets to be optional in spacy project: updated comment. * Allow assets to be optional in spacy project: replacing 'optional' with 'extra' in output. Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Allow assets to be optional in spacy project: replacing 'optional' with 'extra' in docstring.. Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Allow assets to be optional in spacy project: replacing 'optional' with 'extra' in test.. Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Allow assets to be optional in spacy project: replacing 'optional' with 'extra' in test. Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Allow assets to be optional in spacy project: renamed OPTIONAL_DEFAULT to EXTRA_DEFAULT. Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
parent
1543558d08
commit
2904359685
|
@ -12,6 +12,9 @@ from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
|
|||
from .._util import get_checksum, download_file, git_checkout, get_git_version
|
||||
from .._util import SimpleFrozenDict, parse_config_overrides
|
||||
|
||||
# Whether assets are extra if `extra` is not set.
|
||||
EXTRA_DEFAULT = False
|
||||
|
||||
|
||||
@project_cli.command(
|
||||
"assets",
|
||||
|
@ -21,7 +24,8 @@ def project_assets_cli(
|
|||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+.")
|
||||
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+."),
|
||||
extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.")
|
||||
# fmt: on
|
||||
):
|
||||
"""Fetch project assets like datasets and pretrained weights. Assets are
|
||||
|
@ -32,7 +36,12 @@ def project_assets_cli(
|
|||
DOCS: https://spacy.io/api/cli#project-assets
|
||||
"""
|
||||
overrides = parse_config_overrides(ctx.args)
|
||||
project_assets(project_dir, overrides=overrides, sparse_checkout=sparse_checkout)
|
||||
project_assets(
|
||||
project_dir,
|
||||
overrides=overrides,
|
||||
sparse_checkout=sparse_checkout,
|
||||
extra=extra,
|
||||
)
|
||||
|
||||
|
||||
def project_assets(
|
||||
|
@ -40,17 +49,29 @@ def project_assets(
|
|||
*,
|
||||
overrides: Dict[str, Any] = SimpleFrozenDict(),
|
||||
sparse_checkout: bool = False,
|
||||
extra: bool = False,
|
||||
) -> None:
|
||||
"""Fetch assets for a project using DVC if possible.
|
||||
|
||||
project_dir (Path): Path to project directory.
|
||||
sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files
|
||||
needed.
|
||||
extra (bool): Whether to download all assets, including those marked as 'extra'.
|
||||
"""
|
||||
project_path = ensure_path(project_dir)
|
||||
config = load_project_config(project_path, overrides=overrides)
|
||||
assets = config.get("assets", {})
|
||||
assets = [
|
||||
asset
|
||||
for asset in config.get("assets", [])
|
||||
if extra or not asset.get("extra", EXTRA_DEFAULT)
|
||||
]
|
||||
if not assets:
|
||||
msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
|
||||
msg.warn(
|
||||
f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)",
|
||||
exits=0,
|
||||
)
|
||||
msg.info(f"Fetching {len(assets)} asset(s)")
|
||||
|
||||
for asset in assets:
|
||||
dest = (project_dir / asset["dest"]).resolve()
|
||||
checksum = asset.get("checksum")
|
||||
|
|
|
@ -341,6 +341,7 @@ def test_project_config_validation_full():
|
|||
"assets": [
|
||||
{
|
||||
"dest": "x",
|
||||
"extra": True,
|
||||
"url": "https://example.com",
|
||||
"checksum": "63373dd656daa1fd3043ce166a59474c",
|
||||
},
|
||||
|
@ -352,6 +353,12 @@ def test_project_config_validation_full():
|
|||
"path": "y",
|
||||
},
|
||||
},
|
||||
{
|
||||
"dest": "z",
|
||||
"extra": False,
|
||||
"url": "https://example.com",
|
||||
"checksum": "63373dd656daa1fd3043ce166a59474c",
|
||||
},
|
||||
],
|
||||
"commands": [
|
||||
{
|
||||
|
|
|
@ -94,9 +94,8 @@ also use any private repo you have access to with Git.
|
|||
Assets are data files your project needs – for example, the training and
|
||||
evaluation data or pretrained vectors and embeddings to initialize your model
|
||||
with. Each project template comes with a `project.yml` that defines the assets
|
||||
to download and where to put them. The
|
||||
[`spacy project assets`](/api/cli#project-assets) will fetch the project assets
|
||||
for you:
|
||||
to download and where to put them. The [`spacy project assets`](/api/cli#project-assets)
|
||||
command will fetch the project assets for you:
|
||||
|
||||
```cli
|
||||
$ cd some_example_project
|
||||
|
@ -108,6 +107,11 @@ even cloud storage such as GCS and S3. You can also fetch assets using git, by
|
|||
replacing the `url` string with a `git` block. spaCy will use Git's "sparse
|
||||
checkout" feature to avoid downloading the whole repository.
|
||||
|
||||
Sometimes your project configuration may include large assets that you don't
|
||||
necessarily want to download when you run `spacy project assets`. That's why
|
||||
assets can be marked as [`extra`](#data-assets-url) – by default, these assets
|
||||
are not downloaded. If they should be, run `spacy project assets --extra`.
|
||||
|
||||
### 3. Run a command {#run}
|
||||
|
||||
> #### project.yml
|
||||
|
@ -215,9 +219,9 @@ pipelines.
|
|||
|
||||
> #### Tip: Multi-line YAML syntax for long values
|
||||
>
|
||||
> YAML has [multi-line syntax](https://yaml-multiline.info/) that can be
|
||||
> helpful for readability with longer values such as project descriptions or
|
||||
> commands that take several arguments.
|
||||
> YAML has [multi-line syntax](https://yaml-multiline.info/) that can be helpful
|
||||
> for readability with longer values such as project descriptions or commands
|
||||
> that take several arguments.
|
||||
|
||||
```yaml
|
||||
%%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml
|
||||
|
@ -261,8 +265,9 @@ dependencies to use certain protocols.
|
|||
> - dest: 'assets/training.spacy'
|
||||
> url: 'https://example.com/data.spacy'
|
||||
> checksum: '63373dd656daa1fd3043ce166a59474c'
|
||||
> # Download from Google Cloud Storage bucket
|
||||
> # Optional download from Google Cloud Storage bucket
|
||||
> - dest: 'assets/development.spacy'
|
||||
> extra: True
|
||||
> url: 'gs://your-bucket/corpora'
|
||||
> checksum: '5113dc04e03f079525edd8df3f4f39e3'
|
||||
> ```
|
||||
|
@ -270,6 +275,7 @@ dependencies to use certain protocols.
|
|||
| Name | Description |
|
||||
| ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
|
||||
| `extra` | Optional flag determining whether this asset is downloaded only if `spacy project assets` is run with `--extra`. `False` by default. |
|
||||
| `url` | The URL to download from, using the respective protocol. |
|
||||
| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
|
||||
| `description` | Optional asset description, used in [auto-generated docs](#custom-docs). |
|
||||
|
@ -294,12 +300,12 @@ files you need and not the whole repo.
|
|||
> description: 'The training data (5000 examples)'
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
|
||||
| Name | Description |
|
||||
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
|
||||
| `git` | `repo`: The URL of the repo to download from.<br />`path`: Path of the file or directory to download, relative to the repo root. "" specifies the root directory.<br />`branch`: The branch to download from. Defaults to `"master"`. |
|
||||
| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
|
||||
| `description` | Optional asset description, used in [auto-generated docs](#custom-docs). |
|
||||
| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
|
||||
| `description` | Optional asset description, used in [auto-generated docs](#custom-docs). |
|
||||
|
||||
#### Working with private assets {#data-asets-private}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user