mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Allow assets to be optional in spacy project (#10714)
* Allow assets to be optional in spacy project: draft for optional flag/download_all options. * Allow assets to be optional in spacy project: added OPTIONAL_DEFAULT reflecting default asset optionality. * Allow assets to be optional in spacy project: renamed --all to --extra. * Allow assets to be optional in spacy project: included optional flag in project config test. * Allow assets to be optional in spacy project: added documentation. * Allow assets to be optional in spacy project: fixing deprecated --all reference. Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * Allow assets to be optional in spacy project: fixed project_assets() docstring. * Allow assets to be optional in spacy project: adjusted wording in justification of optional assets. Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Allow assets to be optional in spacy project: switched to as keyword in project.yml. Updated docs. * Allow assets to be optional in spacy project: updated comment. * Allow assets to be optional in spacy project: replacing 'optional' with 'extra' in output. Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Allow assets to be optional in spacy project: replacing 'optional' with 'extra' in docstring.. Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Allow assets to be optional in spacy project: replacing 'optional' with 'extra' in test.. Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Allow assets to be optional in spacy project: replacing 'optional' with 'extra' in test. Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Allow assets to be optional in spacy project: renamed OPTIONAL_DEFAULT to EXTRA_DEFAULT. Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
parent
1543558d08
commit
2904359685
|
@ -12,6 +12,9 @@ from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
|
||||||
from .._util import get_checksum, download_file, git_checkout, get_git_version
|
from .._util import get_checksum, download_file, git_checkout, get_git_version
|
||||||
from .._util import SimpleFrozenDict, parse_config_overrides
|
from .._util import SimpleFrozenDict, parse_config_overrides
|
||||||
|
|
||||||
|
# Whether assets are extra if `extra` is not set.
|
||||||
|
EXTRA_DEFAULT = False
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command(
|
@project_cli.command(
|
||||||
"assets",
|
"assets",
|
||||||
|
@ -21,7 +24,8 @@ def project_assets_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
ctx: typer.Context, # This is only used to read additional arguments
|
ctx: typer.Context, # This is only used to read additional arguments
|
||||||
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
|
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
|
||||||
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+.")
|
sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+."),
|
||||||
|
extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""Fetch project assets like datasets and pretrained weights. Assets are
|
"""Fetch project assets like datasets and pretrained weights. Assets are
|
||||||
|
@ -32,7 +36,12 @@ def project_assets_cli(
|
||||||
DOCS: https://spacy.io/api/cli#project-assets
|
DOCS: https://spacy.io/api/cli#project-assets
|
||||||
"""
|
"""
|
||||||
overrides = parse_config_overrides(ctx.args)
|
overrides = parse_config_overrides(ctx.args)
|
||||||
project_assets(project_dir, overrides=overrides, sparse_checkout=sparse_checkout)
|
project_assets(
|
||||||
|
project_dir,
|
||||||
|
overrides=overrides,
|
||||||
|
sparse_checkout=sparse_checkout,
|
||||||
|
extra=extra,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def project_assets(
|
def project_assets(
|
||||||
|
@ -40,17 +49,29 @@ def project_assets(
|
||||||
*,
|
*,
|
||||||
overrides: Dict[str, Any] = SimpleFrozenDict(),
|
overrides: Dict[str, Any] = SimpleFrozenDict(),
|
||||||
sparse_checkout: bool = False,
|
sparse_checkout: bool = False,
|
||||||
|
extra: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Fetch assets for a project using DVC if possible.
|
"""Fetch assets for a project using DVC if possible.
|
||||||
|
|
||||||
project_dir (Path): Path to project directory.
|
project_dir (Path): Path to project directory.
|
||||||
|
sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files
|
||||||
|
needed.
|
||||||
|
extra (bool): Whether to download all assets, including those marked as 'extra'.
|
||||||
"""
|
"""
|
||||||
project_path = ensure_path(project_dir)
|
project_path = ensure_path(project_dir)
|
||||||
config = load_project_config(project_path, overrides=overrides)
|
config = load_project_config(project_path, overrides=overrides)
|
||||||
assets = config.get("assets", {})
|
assets = [
|
||||||
|
asset
|
||||||
|
for asset in config.get("assets", [])
|
||||||
|
if extra or not asset.get("extra", EXTRA_DEFAULT)
|
||||||
|
]
|
||||||
if not assets:
|
if not assets:
|
||||||
msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
|
msg.warn(
|
||||||
|
f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)",
|
||||||
|
exits=0,
|
||||||
|
)
|
||||||
msg.info(f"Fetching {len(assets)} asset(s)")
|
msg.info(f"Fetching {len(assets)} asset(s)")
|
||||||
|
|
||||||
for asset in assets:
|
for asset in assets:
|
||||||
dest = (project_dir / asset["dest"]).resolve()
|
dest = (project_dir / asset["dest"]).resolve()
|
||||||
checksum = asset.get("checksum")
|
checksum = asset.get("checksum")
|
||||||
|
|
|
@ -341,6 +341,7 @@ def test_project_config_validation_full():
|
||||||
"assets": [
|
"assets": [
|
||||||
{
|
{
|
||||||
"dest": "x",
|
"dest": "x",
|
||||||
|
"extra": True,
|
||||||
"url": "https://example.com",
|
"url": "https://example.com",
|
||||||
"checksum": "63373dd656daa1fd3043ce166a59474c",
|
"checksum": "63373dd656daa1fd3043ce166a59474c",
|
||||||
},
|
},
|
||||||
|
@ -352,6 +353,12 @@ def test_project_config_validation_full():
|
||||||
"path": "y",
|
"path": "y",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"dest": "z",
|
||||||
|
"extra": False,
|
||||||
|
"url": "https://example.com",
|
||||||
|
"checksum": "63373dd656daa1fd3043ce166a59474c",
|
||||||
|
},
|
||||||
],
|
],
|
||||||
"commands": [
|
"commands": [
|
||||||
{
|
{
|
||||||
|
|
|
@ -94,9 +94,8 @@ also use any private repo you have access to with Git.
|
||||||
Assets are data files your project needs – for example, the training and
|
Assets are data files your project needs – for example, the training and
|
||||||
evaluation data or pretrained vectors and embeddings to initialize your model
|
evaluation data or pretrained vectors and embeddings to initialize your model
|
||||||
with. Each project template comes with a `project.yml` that defines the assets
|
with. Each project template comes with a `project.yml` that defines the assets
|
||||||
to download and where to put them. The
|
to download and where to put them. The [`spacy project assets`](/api/cli#project-assets)
|
||||||
[`spacy project assets`](/api/cli#project-assets) will fetch the project assets
|
command will fetch the project assets for you:
|
||||||
for you:
|
|
||||||
|
|
||||||
```cli
|
```cli
|
||||||
$ cd some_example_project
|
$ cd some_example_project
|
||||||
|
@ -108,6 +107,11 @@ even cloud storage such as GCS and S3. You can also fetch assets using git, by
|
||||||
replacing the `url` string with a `git` block. spaCy will use Git's "sparse
|
replacing the `url` string with a `git` block. spaCy will use Git's "sparse
|
||||||
checkout" feature to avoid downloading the whole repository.
|
checkout" feature to avoid downloading the whole repository.
|
||||||
|
|
||||||
|
Sometimes your project configuration may include large assets that you don't
|
||||||
|
necessarily want to download when you run `spacy project assets`. That's why
|
||||||
|
assets can be marked as [`extra`](#data-assets-url) – by default, these assets
|
||||||
|
are not downloaded. If they should be, run `spacy project assets --extra`.
|
||||||
|
|
||||||
### 3. Run a command {#run}
|
### 3. Run a command {#run}
|
||||||
|
|
||||||
> #### project.yml
|
> #### project.yml
|
||||||
|
@ -215,9 +219,9 @@ pipelines.
|
||||||
|
|
||||||
> #### Tip: Multi-line YAML syntax for long values
|
> #### Tip: Multi-line YAML syntax for long values
|
||||||
>
|
>
|
||||||
> YAML has [multi-line syntax](https://yaml-multiline.info/) that can be
|
> YAML has [multi-line syntax](https://yaml-multiline.info/) that can be helpful
|
||||||
> helpful for readability with longer values such as project descriptions or
|
> for readability with longer values such as project descriptions or commands
|
||||||
> commands that take several arguments.
|
> that take several arguments.
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
%%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml
|
%%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml
|
||||||
|
@ -261,8 +265,9 @@ dependencies to use certain protocols.
|
||||||
> - dest: 'assets/training.spacy'
|
> - dest: 'assets/training.spacy'
|
||||||
> url: 'https://example.com/data.spacy'
|
> url: 'https://example.com/data.spacy'
|
||||||
> checksum: '63373dd656daa1fd3043ce166a59474c'
|
> checksum: '63373dd656daa1fd3043ce166a59474c'
|
||||||
> # Download from Google Cloud Storage bucket
|
> # Optional download from Google Cloud Storage bucket
|
||||||
> - dest: 'assets/development.spacy'
|
> - dest: 'assets/development.spacy'
|
||||||
|
> extra: True
|
||||||
> url: 'gs://your-bucket/corpora'
|
> url: 'gs://your-bucket/corpora'
|
||||||
> checksum: '5113dc04e03f079525edd8df3f4f39e3'
|
> checksum: '5113dc04e03f079525edd8df3f4f39e3'
|
||||||
> ```
|
> ```
|
||||||
|
@ -270,6 +275,7 @@ dependencies to use certain protocols.
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
|
| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
|
||||||
|
| `extra` | Optional flag determining whether this asset is downloaded only if `spacy project assets` is run with `--extra`. `False` by default. |
|
||||||
| `url` | The URL to download from, using the respective protocol. |
|
| `url` | The URL to download from, using the respective protocol. |
|
||||||
| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
|
| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
|
||||||
| `description` | Optional asset description, used in [auto-generated docs](#custom-docs). |
|
| `description` | Optional asset description, used in [auto-generated docs](#custom-docs). |
|
||||||
|
@ -294,12 +300,12 @@ files you need and not the whole repo.
|
||||||
> description: 'The training data (5000 examples)'
|
> description: 'The training data (5000 examples)'
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
|
| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
|
||||||
| `git` | `repo`: The URL of the repo to download from.<br />`path`: Path of the file or directory to download, relative to the repo root. "" specifies the root directory.<br />`branch`: The branch to download from. Defaults to `"master"`. |
|
| `git` | `repo`: The URL of the repo to download from.<br />`path`: Path of the file or directory to download, relative to the repo root. "" specifies the root directory.<br />`branch`: The branch to download from. Defaults to `"master"`. |
|
||||||
| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
|
| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
|
||||||
| `description` | Optional asset description, used in [auto-generated docs](#custom-docs). |
|
| `description` | Optional asset description, used in [auto-generated docs](#custom-docs). |
|
||||||
|
|
||||||
#### Working with private assets {#data-asets-private}
|
#### Working with private assets {#data-asets-private}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user