Merge pull request #5974 from explosion/feature/project-document

This commit is contained in:
Ines Montani 2020-08-26 15:14:13 +02:00 committed by GitHub
commit 0997c30b9e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 228 additions and 22 deletions

View File

@ -23,6 +23,7 @@ from .project.run import project_run # noqa: F401
from .project.dvc import project_update_dvc # noqa: F401
from .project.push import project_push # noqa: F401
from .project.pull import project_pull # noqa: F401
from .project.document import project_document # noqa: F401
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)

View File

@ -154,6 +154,8 @@ def save_config(config: Config, output_file: Path, is_stdout: bool = False) -> N
if is_stdout:
print(config.to_str())
else:
if not output_file.parent.exists():
output_file.parent.mkdir(parents=True)
config.to_disk(output_file, interpolate=False)
msg.good("Saved config", output_file)
msg.text("You can now add your data and train your model:")

View File

@ -10,15 +10,6 @@ from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_che
from .._util import download_file, git_sparse_checkout
# TODO: find a solution for caches
# CACHES = [
# Path.home() / ".torch",
# Path.home() / ".caches" / "torch",
# os.environ.get("TORCH_HOME"),
# Path.home() / ".keras",
# ]
@project_cli.command("assets")
def project_assets_cli(
# fmt: off
@ -99,7 +90,6 @@ def fetch_asset(
RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
the asset failed.
"""
# TODO: add support for caches
dest_path = (project_path / dest).resolve()
if dest_path.exists() and checksum:
# If there's already a file, check for checksum
@ -134,7 +124,7 @@ def convert_asset_url(url: str) -> str:
RETURNS (str): The converted URL.
"""
# If the asset URL is a regular GitHub URL it's likely a mistake
if re.match(r"(http(s?)):\/\/github.com", url):
if re.match(r"(http(s?)):\/\/github.com", url) and "releases/download" not in url:
converted = url.replace("github.com", "raw.githubusercontent.com")
converted = re.sub(r"/(tree|blob)/", "/", converted)
msg.warn(

View File

@ -0,0 +1,140 @@
from typing import Iterable, Optional
from pathlib import Path
from wasabi import msg
from ...util import working_dir
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
DOCS_URL = "https://nightly.spacy.io"
INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the
project, as well as the available commands and workflows. For details, see the
[spaCy projects documentation]({DOCS_URL}/usage/projects)."""
INTRO_COMMANDS = f"""The following commands are defined by the project. They
can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run).
Commands are only re-run if their inputs have changed."""
INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They
can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run)
and will run the specified commands in order. Commands are only re-run if their
inputs have changed."""
INTRO_ASSETS = f"""The following assets are defined by the project. They can
be fetched by running [`spacy project assets`]({DOCS_URL}/api/cli#project-assets)
in the project directory."""
# These markers are added to the Markdown and can be used to update the file in
# place if it already exists. Only the auto-generated part will be replaced.
MARKER_START = "<!-- AUTO-GENERATED DOCS START (do not remove) -->"
MARKER_END = "<!-- AUTO-GENERATED DOCS END (do not remove) -->"
@project_cli.command("document")
def project_document_cli(
# fmt: off
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
output_file: Path = Opt("-", "--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"),
no_emoji: bool = Opt(False, "--no-emoji", "-NE", help="Don't use emoji")
# fmt: on
):
"""
Auto-generate a README.md for a project. If the content is saved to a file,
hidden markers are added so you can add custom content before or after the
auto-generated section and only the auto-generated docs will be replaced
when you re-run the command.
"""
project_document(project_dir, output_file, no_emoji=no_emoji)
def project_document(
project_dir: Path, output_file: Path, *, no_emoji: bool = False
) -> None:
is_stdout = str(output_file) == "-"
config = load_project_config(project_dir)
md = MarkdownRenderer(no_emoji=no_emoji)
md.add(MARKER_START)
title = config.get("title")
description = config.get("description")
md.add(md.title(1, f"spaCy Project{f': {title}' if title else ''}", "🪐"))
if description:
md.add(description)
md.add(md.title(2, PROJECT_FILE, "📋"))
md.add(INTRO_PROJECT)
# Commands
cmds = config.get("commands", [])
data = [(md.code(cmd["name"]), cmd.get("help", "")) for cmd in cmds]
if data:
md.add(md.title(3, "Commands", ""))
md.add(INTRO_COMMANDS)
md.add(md.table(data, ["Command", "Description"]))
# Workflows
wfs = config.get("workflows", {}).items()
data = [(md.code(n), " &rarr; ".join(md.code(w) for w in stp)) for n, stp in wfs]
if data:
md.add(md.title(3, "Workflows", ""))
md.add(INTRO_WORKFLOWS)
md.add(md.table(data, ["Workflow", "Steps"]))
# Assets
assets = config.get("assets", [])
data = []
for a in assets:
source = "Git" if a.get("git") else "URL" if a.get("url") else "Local"
dest_path = a["dest"]
dest = md.code(dest_path)
if source == "Local":
# Only link assets if they're in the repo
with working_dir(project_dir) as p:
if (p / dest_path).exists():
dest = md.link(dest, dest_path)
data.append((dest, source, a.get("description", "")))
if data:
md.add(md.title(3, "Assets", "🗂"))
md.add(INTRO_ASSETS)
md.add(md.table(data, ["File", "Source", "Description"]))
md.add(MARKER_END)
# Output result
if is_stdout:
print(md.text)
else:
content = md.text
if output_file.exists():
with output_file.open("r", encoding="utf8") as f:
existing = f.read()
if MARKER_START in existing and MARKER_END in existing:
msg.info("Found existing file: only replacing auto-generated docs")
before = existing.split(MARKER_START)[0]
after = existing.split(MARKER_END)[1]
content = f"{before}{content}{after}"
else:
msg.info("Replacing existing file")
with output_file.open("w") as f:
f.write(content)
msg.good("Saved project documentation", output_file)
class MarkdownRenderer:
"""Simple helper for generating raw Markdown."""
def __init__(self, no_emoji: bool = False):
self.data = []
self.no_emoji = no_emoji
@property
def text(self):
return "\n\n".join(self.data)
def add(self, content: str) -> None:
self.data.append(content)
def table(self, data: Iterable[Iterable[str]], header: Iterable[str]) -> str:
head = f"| {' | '.join(header)} |"
divider = f"| {' | '.join('---' for _ in header)} |"
body = "\n".join(f"| {' | '.join(row)} |" for row in data)
return f"{head}\n{divider}\n{body}"
def title(self, level: int, text: str, emoji: Optional[str] = None) -> str:
prefix = f"{emoji} " if emoji and not self.no_emoji else ""
return f"{'#' * level} {prefix}{text}"
def code(self, text: str) -> str:
return f"`{text}`"
def link(self, text: str, url: str) -> str:
return f"[{text}]({url})"

View File

@ -101,6 +101,9 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
print(f"For command details, run: {help_cmd}")
else:
print("")
title = config.get("title")
if title:
print(f"{title}\n")
if config_commands:
print(f"Available commands in {PROJECT_FILE}")
print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")

View File

@ -296,6 +296,7 @@ class ProjectConfigAssetURL(BaseModel):
dest: StrictStr = Field(..., title="Destination of downloaded asset")
url: Optional[StrictStr] = Field(None, title="URL of asset")
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
description: StrictStr = Field("", title="Description of asset")
# fmt: on
@ -303,6 +304,7 @@ class ProjectConfigAssetGit(BaseModel):
# fmt: off
git: ProjectConfigAssetGitItem = Field(..., title="Git repo information")
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
description: Optional[StrictStr] = Field(None, title="Description of asset")
# fmt: on
@ -328,6 +330,7 @@ class ProjectConfigSchema(BaseModel):
assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets")
workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
title: Optional[str] = Field(None, title="Project title")
# fmt: on
class Config:

View File

@ -935,6 +935,41 @@ $ python -m spacy project pull [remote] [project_dir]
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **DOWNLOADS** | All project outputs that do not exist locally and can be found in the remote. |
### project document {#project-document tag="command"}
Auto-generate a pretty Markdown-formatted `README` for your project, based on
its [`project.yml`](/usage/projects#project-yml). Will create sections that
document the available commands, workflows and assets. The auto-generated
content will be placed between two hidden markers, so you can add your own
custom content before or after the auto-generated documentation. When you re-run
the `project document` command, only the auto-generated part is replaced.
```cli
$ python -m spacy project document [project_dir] [--output] [--no-emoji]
```
> #### Example
>
> ```cli
> $ python -m spacy project document --output README.md
> ```
<Accordion title="Example output" spaced>
For more examples, see the templates in our
[`projects`](https://github.com/explosion/projects) repo.
![Screenshot of auto-generated Markdown Readme](../images/project_document.jpg)
</Accordion>
| Name | Description |
| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
| `--output`, `-o` | Path to output file or `-` for stdout (default). If a file is specified and it already exists and contains auto-generated docs, only the auto-generated docs section is replaced. ~~Path (positional)~~ |
|  `--no-emoji`, `-NE` | Don't use emoji in the titles. ~~bool (flag)~~ |
| **CREATES** | The Markdown-formatted project documentation. |
### project dvc {#project-dvc tag="command"}
Auto-generate [Data Version Control](https://dvc.org) (DVC) config file. Calls

Binary file not shown.

After

Width:  |  Height:  |  Size: 588 KiB

View File

@ -226,6 +226,8 @@ https://github.com/explosion/spacy-boilerplates/blob/master/ner_fashion/project.
| Section | Description |
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
| `description` | An optional project description used in [auto-generated docs](#custom-docs). |
| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
@ -264,11 +266,12 @@ dependencies to use certain protocols.
> checksum: '5113dc04e03f079525edd8df3f4f39e3'
> ```
| Name | Description |
| ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
| `url` | The URL to download from, using the respective protocol. |
| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
| Name | Description |
| ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
| `url` | The URL to download from, using the respective protocol. |
| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
| `description` | Optional asset description, used in [auto-generated docs](#custom-docs). |
#### Downloading from a Git repo {#data-assets-git}
@ -287,13 +290,15 @@ files you need and not the whole repo.
> branch: 'master'
> path: 'path/training.spacy'
> checksum: '63373dd656daa1fd3043ce166a59474c'
> description: 'The training data (5000 examples)'
> ```
| Name | Description |
| ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
| `git` | `repo`: The URL of the repo to download from.<br />`path`: Path of the file or directory to download, relative to the repo root.<br />`branch`: The branch to download from. Defaults to `"master"`. |
| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
| Name | Description |
| ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. |
| `git` | `repo`: The URL of the repo to download from.<br />`path`: Path of the file or directory to download, relative to the repo root.<br />`branch`: The branch to download from. Defaults to `"master"`. |
| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. |
| `description` | Optional asset description, used in [auto-generated docs](#custom-docs). |
#### Working with private assets {#data-asets-private}
@ -488,12 +493,39 @@ vars:
commands:
- name: evaluate
script:
- 'python scripts/custom_evaluation.py ${batch_size} ./training/model-best ./corpus/eval.json'
- 'python scripts/custom_evaluation.py ${vars.batch_size} ./training/model-best ./corpus/eval.json'
deps:
- 'training/model-best'
- 'corpus/eval.json'
```
### Documenting your project {#custom-docs}
> #### Readme Example
>
> For more examples, see the [`projects`](https://github.com/explosion/projects)
> repo.
>
> ![Screenshot of auto-generated Markdown Readme](../images/project_document.jpg)
When your custom project is ready and you want to share it with others, you can
use the [`spacy project document`](/api/cli#project-document) command to
**auto-generate** a pretty, Markdown-formatted `README` file based on your
project's `project.yml`. It will list all commands, workflows and assets defined
in the project and include details on how to run the project, as well as links
to the relevant spaCy documentation to make it easy for others to get started
using your project.
```cli
$ python -m spacy project document --output README.md
```
Under the hood, hidden markers are added to identify where the auto-generated
content starts and ends. This means that you can add your own custom content
before or after it and re-running the `project document` command will **only
update the auto-generated part**. This makes it easy to keep your documentation
up to date.
### Cloning from your own repo {#custom-repo}
The [`spacy project clone`](/api/cli#project-clone) command lets you customize