diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 4c3adc5d3..1503acd24 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -23,6 +23,7 @@ from .project.run import project_run # noqa: F401 from .project.dvc import project_update_dvc # noqa: F401 from .project.push import project_push # noqa: F401 from .project.pull import project_pull # noqa: F401 +from .project.document import project_document # noqa: F401 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index 723068106..b5335df51 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -154,6 +154,8 @@ def save_config(config: Config, output_file: Path, is_stdout: bool = False) -> N if is_stdout: print(config.to_str()) else: + if not output_file.parent.exists(): + output_file.parent.mkdir(parents=True) config.to_disk(output_file, interpolate=False) msg.good("Saved config", output_file) msg.text("You can now add your data and train your model:") diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py index f78c4f617..e33a82acc 100644 --- a/spacy/cli/project/assets.py +++ b/spacy/cli/project/assets.py @@ -10,15 +10,6 @@ from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_che from .._util import download_file, git_sparse_checkout -# TODO: find a solution for caches -# CACHES = [ -# Path.home() / ".torch", -# Path.home() / ".caches" / "torch", -# os.environ.get("TORCH_HOME"), -# Path.home() / ".keras", -# ] - - @project_cli.command("assets") def project_assets_cli( # fmt: off @@ -99,7 +90,6 @@ def fetch_asset( RETURNS (Optional[Path]): The path to the fetched asset or None if fetching the asset failed. 
""" - # TODO: add support for caches dest_path = (project_path / dest).resolve() if dest_path.exists() and checksum: # If there's already a file, check for checksum @@ -134,7 +124,7 @@ def convert_asset_url(url: str) -> str: RETURNS (str): The converted URL. """ # If the asset URL is a regular GitHub URL it's likely a mistake - if re.match(r"(http(s?)):\/\/github.com", url): + if re.match(r"(http(s?)):\/\/github.com", url) and "releases/download" not in url: converted = url.replace("github.com", "raw.githubusercontent.com") converted = re.sub(r"/(tree|blob)/", "/", converted) msg.warn( diff --git a/spacy/cli/project/document.py b/spacy/cli/project/document.py new file mode 100644 index 000000000..9c9008400 --- /dev/null +++ b/spacy/cli/project/document.py @@ -0,0 +1,140 @@ +from typing import Iterable, Optional +from pathlib import Path +from wasabi import msg + +from ...util import working_dir +from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config + + +DOCS_URL = "https://nightly.spacy.io" +INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the +project, as well as the available commands and workflows. For details, see the +[spaCy projects documentation]({DOCS_URL}/usage/projects).""" +INTRO_COMMANDS = f"""The following commands are defined by the project. They +can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run). +Commands are only re-run if their inputs have changed.""" +INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They +can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run) +and will run the specified commands in order. Commands are only re-run if their +inputs have changed.""" +INTRO_ASSETS = f"""The following assets are defined by the project. 
They can +be fetched by running [`spacy project assets`]({DOCS_URL}/api/cli#project-assets) +in the project directory.""" +# These markers are added to the Markdown and can be used to update the file in +# place if it already exists. Only the auto-generated part will be replaced. +MARKER_START = "" +MARKER_END = "" + + +@project_cli.command("document") +def project_document_cli( + # fmt: off + project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), + output_file: Path = Opt("-", "--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"), + no_emoji: bool = Opt(False, "--no-emoji", "-NE", help="Don't use emoji") + # fmt: on +): + """ + Auto-generate a README.md for a project. If the content is saved to a file, + hidden markers are added so you can add custom content before or after the + auto-generated section and only the auto-generated docs will be replaced + when you re-run the command. 
+ """ + project_document(project_dir, output_file, no_emoji=no_emoji) + + +def project_document( + project_dir: Path, output_file: Path, *, no_emoji: bool = False +) -> None: + is_stdout = str(output_file) == "-" + config = load_project_config(project_dir) + md = MarkdownRenderer(no_emoji=no_emoji) + md.add(MARKER_START) + title = config.get("title") + description = config.get("description") + md.add(md.title(1, f"spaCy Project{f': {title}' if title else ''}", "🪐")) + if description: + md.add(description) + md.add(md.title(2, PROJECT_FILE, "📋")) + md.add(INTRO_PROJECT) + # Commands + cmds = config.get("commands", []) + data = [(md.code(cmd["name"]), cmd.get("help", "")) for cmd in cmds] + if data: + md.add(md.title(3, "Commands", "⏯")) + md.add(INTRO_COMMANDS) + md.add(md.table(data, ["Command", "Description"])) + # Workflows + wfs = config.get("workflows", {}).items() + data = [(md.code(n), " → ".join(md.code(w) for w in stp)) for n, stp in wfs] + if data: + md.add(md.title(3, "Workflows", "⏭")) + md.add(INTRO_WORKFLOWS) + md.add(md.table(data, ["Workflow", "Steps"])) + # Assets + assets = config.get("assets", []) + data = [] + for a in assets: + source = "Git" if a.get("git") else "URL" if a.get("url") else "Local" + dest_path = a["dest"] + dest = md.code(dest_path) + if source == "Local": + # Only link assets if they're in the repo + with working_dir(project_dir) as p: + if (p / dest_path).exists(): + dest = md.link(dest, dest_path) + data.append((dest, source, a.get("description", ""))) + if data: + md.add(md.title(3, "Assets", "🗂")) + md.add(INTRO_ASSETS) + md.add(md.table(data, ["File", "Source", "Description"])) + md.add(MARKER_END) + # Output result + if is_stdout: + print(md.text) + else: + content = md.text + if output_file.exists(): + with output_file.open("r", encoding="utf8") as f: + existing = f.read() + if MARKER_START in existing and MARKER_END in existing: + msg.info("Found existing file: only replacing auto-generated docs") + before = 
existing.split(MARKER_START)[0] + after = existing.split(MARKER_END)[1] + content = f"{before}{content}{after}" + else: + msg.info("Replacing existing file") + with output_file.open("w") as f: + f.write(content) + msg.good("Saved project documentation", output_file) + + +class MarkdownRenderer: + """Simple helper for generating raw Markdown.""" + + def __init__(self, no_emoji: bool = False): + self.data = [] + self.no_emoji = no_emoji + + @property + def text(self): + return "\n\n".join(self.data) + + def add(self, content: str) -> None: + self.data.append(content) + + def table(self, data: Iterable[Iterable[str]], header: Iterable[str]) -> str: + head = f"| {' | '.join(header)} |" + divider = f"| {' | '.join('---' for _ in header)} |" + body = "\n".join(f"| {' | '.join(row)} |" for row in data) + return f"{head}\n{divider}\n{body}" + + def title(self, level: int, text: str, emoji: Optional[str] = None) -> str: + prefix = f"{emoji} " if emoji and not self.no_emoji else "" + return f"{'#' * level} {prefix}{text}" + + def code(self, text: str) -> str: + return f"`{text}`" + + def link(self, text: str, url: str) -> str: + return f"[{text}]({url})" diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index 6e1deeeee..7b579314b 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -101,6 +101,9 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: print(f"For command details, run: {help_cmd}") else: print("") + title = config.get("title") + if title: + print(f"{title}\n") if config_commands: print(f"Available commands in {PROJECT_FILE}") print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}") diff --git a/spacy/schemas.py b/spacy/schemas.py index 0fdb1b332..d5a9ac89a 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -296,6 +296,7 @@ class ProjectConfigAssetURL(BaseModel): dest: StrictStr = Field(..., title="Destination of downloaded asset") url: Optional[StrictStr] = Field(None, title="URL of 
asset") checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})") + description: StrictStr = Field("", title="Description of asset") # fmt: on @@ -303,6 +304,7 @@ class ProjectConfigAssetGit(BaseModel): # fmt: off git: ProjectConfigAssetGitItem = Field(..., title="Git repo information") checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})") + description: Optional[StrictStr] = Field(None, title="Description of asset") # fmt: on @@ -328,6 +330,7 @@ class ProjectConfigSchema(BaseModel): assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets") workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order") commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts") + title: Optional[str] = Field(None, title="Project title") # fmt: on class Config: diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 967a96dda..57c5fcc37 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -935,6 +935,41 @@ $ python -m spacy project pull [remote] [project_dir] | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | **DOWNLOADS** | All project outputs that do not exist locally and can be found in the remote. | +### project document {#project-document tag="command"} + +Auto-generate a pretty Markdown-formatted `README` for your project, based on +its [`project.yml`](/usage/projects#project-yml). Will create sections that +document the available commands, workflows and assets. The auto-generated +content will be placed between two hidden markers, so you can add your own +custom content before or after the auto-generated documentation. When you re-run +the `project document` command, only the auto-generated part is replaced. 
+ +```cli +$ python -m spacy project document [project_dir] [--output] [--no-emoji] +``` + +> #### Example +> +> ```cli +> $ python -m spacy project document --output README.md +> ``` + + + +For more examples, see the templates in our +[`projects`](https://github.com/explosion/projects) repo. + +![Screenshot of auto-generated Markdown Readme](../images/project_document.jpg) + + + +| Name | Description | +| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | +| `--output`, `-o` | Path to output file or `-` for stdout (default). If a file is specified and it already exists and contains auto-generated docs, only the auto-generated docs section is replaced. ~~Path (option)~~ | +| `--no-emoji`, `-NE` | Don't use emoji in the titles. ~~bool (flag)~~ | +| **CREATES** | The Markdown-formatted project documentation. | + ### project dvc {#project-dvc tag="command"} Auto-generate [Data Version Control](https://dvc.org) (DVC) config file. Calls diff --git a/website/docs/images/project_document.jpg b/website/docs/images/project_document.jpg new file mode 100644 index 000000000..7942619a8 Binary files /dev/null and b/website/docs/images/project_document.jpg differ diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 6a32ab5d4..620526280 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -226,6 +226,8 @@ https://github.com/explosion/spacy-boilerplates/blob/master/ner_fashion/project. 
| Section | Description | | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). | +| `description` | An optional project description used in [auto-generated docs](#custom-docs). | | `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. | | `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. | | `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. | @@ -264,11 +266,12 @@ dependencies to use certain protocols. 
> checksum: '5113dc04e03f079525edd8df3f4f39e3' > ``` -| Name | Description | -| ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. | -| `url` | The URL to download from, using the respective protocol. | -| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. | +| Name | Description | +| ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. | +| `url` | The URL to download from, using the respective protocol. | +| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. | +| `description` | Optional asset description, used in [auto-generated docs](#custom-docs). | #### Downloading from a Git repo {#data-assets-git} @@ -287,13 +290,15 @@ files you need and not the whole repo. 
> branch: 'master' > path: 'path/training.spacy' > checksum: '63373dd656daa1fd3043ce166a59474c' +> description: 'The training data (5000 examples)' > ``` -| Name | Description | -| ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. | -| `git` | `repo`: The URL of the repo to download from.
`path`: Path of the file or directory to download, relative to the repo root.
`branch`: The branch to download from. Defaults to `"master"`. | -| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. | +| Name | Description | +| ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `dest` | The destination path to save the downloaded asset to (relative to the project directory), including the file name. | +| `git` | `repo`: The URL of the repo to download from.
`path`: Path of the file or directory to download, relative to the repo root.
`branch`: The branch to download from. Defaults to `"master"`. | +| `checksum` | Optional checksum of the file. If provided, it will be used to verify that the file matches and downloads will be skipped if a local file with the same checksum already exists. | +| `description` | Optional asset description, used in [auto-generated docs](#custom-docs). | #### Working with private assets {#data-asets-private} @@ -488,12 +493,39 @@ vars: commands: - name: evaluate script: - - 'python scripts/custom_evaluation.py ${batch_size} ./training/model-best ./corpus/eval.json' + - 'python scripts/custom_evaluation.py ${vars.batch_size} ./training/model-best ./corpus/eval.json' deps: - 'training/model-best' - 'corpus/eval.json' ``` +### Documenting your project {#custom-docs} + +> #### Readme Example +> +> For more examples, see the [`projects`](https://github.com/explosion/projects) +> repo. +> +> ![Screenshot of auto-generated Markdown Readme](../images/project_document.jpg) + +When your custom project is ready and you want to share it with others, you can +use the [`spacy project document`](/api/cli#project-document) command to +**auto-generate** a pretty, Markdown-formatted `README` file based on your +project's `project.yml`. It will list all commands, workflows and assets defined +in the project and include details on how to run the project, as well as links +to the relevant spaCy documentation to make it easy for others to get started +using your project. + +```cli +$ python -m spacy project document --output README.md +``` + +Under the hood, hidden markers are added to identify where the auto-generated +content starts and ends. This means that you can add your own custom content +before or after it and re-running the `project document` command will **only +update the auto-generated part**. This makes it easy to keep your documentation +up to date. + ### Cloning from your own repo {#custom-repo} The [`spacy project clone`](/api/cli#project-clone) command lets you customize