From da504737012e5c92c191b8ad37072f2122ec8666 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Mon, 29 Jun 2020 12:17:41 +0200 Subject: [PATCH 1/7] Tweak efficiency of arc_eager.set_costs --- spacy/syntax/arc_eager.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index fcc05de3f..f129ee7d1 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -729,12 +729,13 @@ cdef class ArcEager(TransitionSystem): cdef ArcEagerGold gold_ = gold gold_.update(stcls) gold_state = gold_.c - n_gold = 0 + cdef int n_gold = 0 for i in range(self.n_moves): if self.c[i].is_valid(stcls.c, self.c[i].label): is_valid[i] = True costs[i] = self.c[i].get_cost(stcls, &gold_state, self.c[i].label) - n_gold += costs[i] <= 0 + if costs[i] <= 0: + n_gold += 1 else: is_valid[i] = False costs[i] = 9000 From 24664efa234a7c1138858a4e3734ce234e819371 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 29 Jun 2020 16:54:19 +0200 Subject: [PATCH 2/7] Import project_run_all function --- spacy/cli/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index db409f431..5dc3070b6 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -16,6 +16,7 @@ from .convert import convert # noqa: F401 from .init_model import init_model # noqa: F401 from .validate import validate # noqa: F401 from .project import project_clone, project_assets, project_run # noqa: F401 +from .project import project_run_all # noqa: F401 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) From 7c08713baa75094726ebafbd8c092911b72f839f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 29 Jun 2020 16:54:47 +0200 Subject: [PATCH 3/7] Improve error messages --- spacy/cli/project.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 5011a13f9..c02c1cf98 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -1,4 +1,4 @@ -from typing import List, Dict, Any, Optional +from typing import List, Dict, Any, Optional, Sequence import typer import srsly from pathlib import Path @@ -372,8 +372,7 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: config_commands = config.get("commands", []) commands = {cmd["name"]: cmd for cmd in config_commands} if subcommand: - if subcommand not in commands: - msg.fail(f"Can't find command '{subcommand}' in project config", exits=1) + validate_subcommand(commands.keys(), subcommand) print(f"Usage: {COMMAND} project run {project_dir} {subcommand}") help_text = commands[subcommand].get("help") if help_text: @@ -401,8 +400,7 @@ def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None: config_commands = config.get("commands", []) variables = config.get("variables", {}) commands = {cmd["name"]: cmd for cmd in config_commands} - if subcommand not in commands: - msg.fail(f"Can't find command '{subcommand}' in project config", exits=1) + validate_subcommand(commands.keys(), subcommand) if subcommand in config.get("run", []): # This is one of the pipeline commands tracked in DVC dvc_cmd = ["dvc", "repro", subcommand, *dvc_args] @@ -448,10 +446,14 @@ def load_project_config(path: Path) -> Dict[str, Any]: config_path = path / CONFIG_FILE if not config_path.exists(): msg.fail("Can't find project config", config_path, exits=1) - config = srsly.read_yaml(config_path) + invalid_err = f"Invalid project config in {CONFIG_FILE}" + try: + config = srsly.read_yaml(config_path) + except ValueError as e: + msg.fail(invalid_err, e, exits=1) errors = validate(ProjectConfigSchema, config) if errors: - msg.fail(f"Invalid project config in {CONFIG_FILE}", "\n".join(errors), exits=1) + msg.fail(invalid_err, "\n".join(errors), exits=1) return config @@ -490,8 +492,7 @@ def update_dvc_config( # commands in project.yml and should be run in sequence config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} for name in config.get("run", []): - if name not in config_commands: - msg.fail(f"Can't find command '{name}' in project config", exits=1) + validate_subcommand(config_commands.keys(), name) command = config_commands[name] deps = command.get("deps", []) outputs = command.get("outputs", []) @@ -634,6 +635,20 @@ def check_clone(name: str, dest: Path, repo: str) -> None: ) +def validate_subcommand(commands: Sequence[str], subcommand: str) -> None: + """Check that a subcommand is valid and defined. Raises an error otherwise. + + commands (Sequence[str]): The available commands. + subcommand (str): The subcommand. + """ + if subcommand not in commands: + msg.fail( + f"Can't find command '{subcommand}' in {CONFIG_FILE}. " + f"Available commands: {', '.join(commands)}", + exits=1, + ) + + def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None: """Download a file using requests. From 126050f259c3bfbef43129bc156c0a6ac381d1f5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 29 Jun 2020 16:55:24 +0200 Subject: [PATCH 4/7] Improve asset fetching Get all paths first and run dvc add once so it only shows one progress bar and one combined git command (if repo is git repo) --- spacy/cli/project.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index c02c1cf98..861a48f4b 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -295,15 +295,21 @@ def project_assets(project_dir: Path) -> None: msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0) msg.info(f"Fetching {len(assets)} asset(s)") variables = config.get("variables", {}) + fetched_assets = [] for asset in assets: url = asset["url"].format(**variables) dest = asset["dest"].format(**variables) - fetch_asset(project_path, url, dest, asset.get("checksum")) + fetched_path = fetch_asset(project_path, url, dest, asset.get("checksum")) + if fetched_path: + fetched_assets.append(str(fetched_path)) + if fetched_assets: + with working_dir(project_path): + run_command(["dvc", "add", *fetched_assets, "--external"]) def fetch_asset( project_path: Path, url: str, dest: Path, checksum: Optional[str] = None -) -> None: +) -> Optional[Path]: """Fetch an asset from a given URL or path. Will try to import the file using DVC's import-url if possible (fully tracked and versioned) and falls back to get-url (versioned) and a non-DVC download if necessary. If a @@ -313,6 +319,8 @@ def fetch_asset( project_path (Path): Path to project directory. url (str): URL or path to asset. checksum (Optional[str]): Optional expected checksum of local file. + RETURNS (Optional[Path]): The path to the fetched asset or None if fetching + the asset failed. """ url = convert_asset_url(url) dest_path = (project_path / dest).resolve() @@ -321,8 +329,7 @@ def fetch_asset( # TODO: add support for caches (dvc import-url with local path) if checksum == get_checksum(dest_path): msg.good(f"Skipping download with matching checksum: {dest}") - return - dvc_add_cmd = ["dvc", "add", str(dest_path), "--external"] + return dest_path with working_dir(project_path): try: # If these fail, we don't want to output an error or info message. @@ -334,16 +341,16 @@ def fetch_asset( except subprocess.CalledProcessError: dvc_cmd = ["dvc", "get-url", url, str(dest_path)] print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL)) - run_command(dvc_add_cmd) except subprocess.CalledProcessError: try: download_file(url, dest_path) except requests.exceptions.HTTPError as e: msg.fail(f"Download failed: {dest}", e) - run_command(dvc_add_cmd) + return None if checksum and checksum != get_checksum(dest_path): msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}") msg.good(f"Fetched asset {dest}") + return dest_path def project_run_all(project_dir: Path, *dvc_args) -> None: From 1d2c646e579f0e018e2eed363cc0a0240dde8d47 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 29 Jun 2020 20:07:21 +0200 Subject: [PATCH 5/7] Fix init and remove .dvc/plots --- spacy/cli/project.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 861a48f4b..5382e5bdc 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -21,6 +21,7 @@ from ..util import get_hash, get_checksum CONFIG_FILE = "project.yml" DVC_CONFIG = "dvc.yaml" +DVC_DIR = ".dvc" DIRS = [ "assets", "metas", @@ -88,6 +89,7 @@ def project_clone_cli( def project_init_cli( path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False), git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), + force: bool = Opt(False, "--force", "-F", help="Force initiziation"), ): """Initialize a project directory with DVC and optionally Git. This should typically be taken care of automatically when you run the "project clone" @@ -95,7 +97,7 @@ def project_init_cli( be a Git repo, it should be initialized with Git first, before initializing DVC. This allows DVC to integrate with Git. """ - project_init(path, git=git, silent=True) + project_init(path, git=git, force=force, silent=True) @project_cli.command("assets") @@ -246,7 +248,7 @@ def project_clone( if not dir_path.exists(): dir_path.mkdir(parents=True) if not no_init: - project_init(project_dir, git=git, silent=True) + project_init(project_dir, git=git, force=True, silent=True) msg.good(f"Your project is now ready!", dest) print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}") @@ -255,6 +257,7 @@ def project_init( project_dir: Path, *, git: bool = False, + force: bool = False, silent: bool = False, analytics: bool = False, ): @@ -265,19 +268,29 @@ def project_init( silent (bool): Don't print any output (via DVC). analytics (bool): Opt-in to DVC analytics (defaults to False). """ + project_dir = project_dir.resolve() with working_dir(project_dir): + if git: + run_command(["git", "init"]) init_cmd = ["dvc", "init"] if silent: init_cmd.append("--quiet") if not git: init_cmd.append("--no-scm") - if git: - run_command(["git", "init"]) + if force: + init_cmd.append("--force") run_command(init_cmd) # We don't want to have analytics on by default – our users should # opt-in explicitly. If they want it, they can always enable it. if not analytics: run_command(["dvc", "config", "core.analytics", "false"]) + # Remove unused and confusing plot templates from .dvc directory + # TODO: maybe we shouldn't do this, but it's otherwise super confusing + # once you commit your changes via Git and it creates a bunch of files + # that have no purpose + plots_dir = project_dir / DVC_DIR / "plots" + if plots_dir.exists(): + shutil.rmtree(str(plots_dir)) config = load_project_config(project_dir) setup_check_dvc(project_dir, config) From c874dde66c84d9b162c87e09d7a84548c3c36f6e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 29 Jun 2020 20:11:34 +0200 Subject: [PATCH 6/7] Show help on "spacy project" --- spacy/cli/project.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 5382e5bdc..27dc83e29 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -49,7 +49,7 @@ Version Control) to manage input and output files and to ensure steps are only re-run if their inputs change. """ -project_cli = typer.Typer(help=CLI_HELP) +project_cli = typer.Typer(help=CLI_HELP, no_args_is_help=True) @project_cli.callback(invoke_without_command=True) From e8033df81ecde6baec2e7f69027827e4ee726939 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 29 Jun 2020 20:30:42 +0200 Subject: [PATCH 7/7] Also handle python3 and pip3 --- spacy/cli/project.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/project.py b/spacy/cli/project.py index 27dc83e29..1bb92f749 100644 --- a/spacy/cli/project.py +++ b/spacy/cli/project.py @@ -595,9 +595,9 @@ def run_commands( command = command.format(**variables) command = shlex.split(command) # TODO: is this needed / a good idea? - if len(command) and command[0] == "python": + if len(command) and command[0] in ("python", "python3"): command[0] = sys.executable - elif len(command) and command[0] == "pip": + elif len(command) and command[0] in ("pip", "pip3"): command = [sys.executable, "-m", "pip", *command[1:]] if not silent: print(" ".join(command))