Merge remote-tracking branch 'upstream/develop' into fix/small-edits

# Conflicts:
#	spacy/cli/project.py
This commit is contained in:
svlandeg 2020-06-30 11:17:31 +02:00
commit b311ce982f
3 changed files with 63 additions and 26 deletions

View File

@ -16,6 +16,7 @@ from .convert import convert # noqa: F401
from .init_model import init_model # noqa: F401 from .init_model import init_model # noqa: F401
from .validate import validate # noqa: F401 from .validate import validate # noqa: F401
from .project import project_clone, project_assets, project_run # noqa: F401 from .project import project_clone, project_assets, project_run # noqa: F401
from .project import project_run_all # noqa: F401
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True) @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)

View File

@ -1,4 +1,4 @@
from typing import List, Dict, Any, Optional from typing import List, Dict, Any, Optional, Sequence
import typer import typer
import srsly import srsly
from pathlib import Path from pathlib import Path
@ -22,6 +22,7 @@ from ..util import get_hash, get_checksum
CONFIG_FILE = "project.yml" CONFIG_FILE = "project.yml"
DVC_CONFIG = "dvc.yaml" DVC_CONFIG = "dvc.yaml"
DVC_DIR = ".dvc"
DIRS = [ DIRS = [
"assets", "assets",
"metas", "metas",
@ -49,7 +50,7 @@ Version Control) to manage input and output files and to ensure steps are only
re-run if their inputs change. re-run if their inputs change.
""" """
project_cli = typer.Typer(help=CLI_HELP) project_cli = typer.Typer(help=CLI_HELP, no_args_is_help=True)
@project_cli.callback(invoke_without_command=True) @project_cli.callback(invoke_without_command=True)
@ -91,6 +92,7 @@ def project_clone_cli(
def project_init_cli( def project_init_cli(
path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False), path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False),
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
force: bool = Opt(False, "--force", "-F", help="Force initiziation"),
): ):
"""Initialize a project directory with DVC and optionally Git. This should """Initialize a project directory with DVC and optionally Git. This should
typically be taken care of automatically when you run the "project clone" typically be taken care of automatically when you run the "project clone"
@ -98,7 +100,7 @@ def project_init_cli(
be a Git repo, it should be initialized with Git first, before initializing be a Git repo, it should be initialized with Git first, before initializing
DVC. This allows DVC to integrate with Git. DVC. This allows DVC to integrate with Git.
""" """
project_init(path, git=git, silent=True) project_init(path, git=git, force=force, silent=True)
@project_cli.command("assets") @project_cli.command("assets")
@ -252,7 +254,7 @@ def project_clone(
if not dir_path.exists(): if not dir_path.exists():
dir_path.mkdir(parents=True) dir_path.mkdir(parents=True)
if not no_init: if not no_init:
project_init(project_dir, git=git, silent=True) project_init(project_dir, git=git, force=True, silent=True)
msg.good(f"Your project is now ready!", dest) msg.good(f"Your project is now ready!", dest)
print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}") print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
@ -261,6 +263,7 @@ def project_init(
project_dir: Path, project_dir: Path,
*, *,
git: bool = False, git: bool = False,
force: bool = False,
silent: bool = False, silent: bool = False,
analytics: bool = False, analytics: bool = False,
): ):
@ -271,19 +274,29 @@ def project_init(
silent (bool): Don't print any output (via DVC). silent (bool): Don't print any output (via DVC).
analytics (bool): Opt-in to DVC analytics (defaults to False). analytics (bool): Opt-in to DVC analytics (defaults to False).
""" """
project_dir = project_dir.resolve()
with working_dir(project_dir): with working_dir(project_dir):
if git:
run_command(["git", "init"])
init_cmd = ["dvc", "init"] init_cmd = ["dvc", "init"]
if silent: if silent:
init_cmd.append("--quiet") init_cmd.append("--quiet")
if not git: if not git:
init_cmd.append("--no-scm") init_cmd.append("--no-scm")
if git: if force:
run_command(["git", "init"]) init_cmd.append("--force")
run_command(init_cmd) run_command(init_cmd)
# We don't want to have analytics on by default our users should # We don't want to have analytics on by default our users should
# opt-in explicitly. If they want it, they can always enable it. # opt-in explicitly. If they want it, they can always enable it.
if not analytics: if not analytics:
run_command(["dvc", "config", "core.analytics", "false"]) run_command(["dvc", "config", "core.analytics", "false"])
# Remove unused and confusing plot templates from .dvc directory
# TODO: maybe we shouldn't do this, but it's otherwise super confusing
# once you commit your changes via Git and it creates a bunch of files
# that have no purpose
plots_dir = project_dir / DVC_DIR / "plots"
if plots_dir.exists():
shutil.rmtree(str(plots_dir))
config = load_project_config(project_dir) config = load_project_config(project_dir)
setup_check_dvc(project_dir, config) setup_check_dvc(project_dir, config)
@ -301,15 +314,21 @@ def project_assets(project_dir: Path) -> None:
msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0) msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0)
msg.info(f"Fetching {len(assets)} asset(s)") msg.info(f"Fetching {len(assets)} asset(s)")
variables = config.get("variables", {}) variables = config.get("variables", {})
fetched_assets = []
for asset in assets: for asset in assets:
url = asset["url"].format(**variables) url = asset["url"].format(**variables)
dest = asset["dest"].format(**variables) dest = asset["dest"].format(**variables)
fetch_asset(project_path, url, dest, asset.get("checksum")) fetched_path = fetch_asset(project_path, url, dest, asset.get("checksum"))
if fetched_path:
fetched_assets.append(str(fetched_path))
if fetched_assets:
with working_dir(project_path):
run_command(["dvc", "add", *fetched_assets, "--external"])
def fetch_asset( def fetch_asset(
project_path: Path, url: str, dest: Path, checksum: Optional[str] = None project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
) -> None: ) -> Optional[Path]:
"""Fetch an asset from a given URL or path. Will try to import the file """Fetch an asset from a given URL or path. Will try to import the file
using DVC's import-url if possible (fully tracked and versioned) and falls using DVC's import-url if possible (fully tracked and versioned) and falls
back to get-url (versioned) and a non-DVC download if necessary. If a back to get-url (versioned) and a non-DVC download if necessary. If a
@ -319,6 +338,8 @@ def fetch_asset(
project_path (Path): Path to project directory. project_path (Path): Path to project directory.
url (str): URL or path to asset. url (str): URL or path to asset.
checksum (Optional[str]): Optional expected checksum of local file. checksum (Optional[str]): Optional expected checksum of local file.
RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
the asset failed.
""" """
url = convert_asset_url(url) url = convert_asset_url(url)
dest_path = (project_path / dest).resolve() dest_path = (project_path / dest).resolve()
@ -327,8 +348,7 @@ def fetch_asset(
# TODO: add support for caches (dvc import-url with local path) # TODO: add support for caches (dvc import-url with local path)
if checksum == get_checksum(dest_path): if checksum == get_checksum(dest_path):
msg.good(f"Skipping download with matching checksum: {dest}") msg.good(f"Skipping download with matching checksum: {dest}")
return return dest_path
dvc_add_cmd = ["dvc", "add", str(dest_path), "--external"]
with working_dir(project_path): with working_dir(project_path):
try: try:
# If these fail, we don't want to output an error or info message. # If these fail, we don't want to output an error or info message.
@ -340,16 +360,16 @@ def fetch_asset(
except subprocess.CalledProcessError: except subprocess.CalledProcessError:
dvc_cmd = ["dvc", "get-url", url, str(dest_path)] dvc_cmd = ["dvc", "get-url", url, str(dest_path)]
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL)) print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
run_command(dvc_add_cmd)
except subprocess.CalledProcessError: except subprocess.CalledProcessError:
try: try:
download_file(url, dest_path) download_file(url, dest_path)
except requests.exceptions.HTTPError as e: except requests.exceptions.HTTPError as e:
msg.fail(f"Download failed: {dest}", e) msg.fail(f"Download failed: {dest}", e)
run_command(dvc_add_cmd) return None
if checksum and checksum != get_checksum(dest_path): if checksum and checksum != get_checksum(dest_path):
msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}") msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}")
msg.good(f"Fetched asset {dest}") msg.good(f"Fetched asset {dest}")
return dest_path
def project_run_all(project_dir: Path, *dvc_args) -> None: def project_run_all(project_dir: Path, *dvc_args) -> None:
@ -378,8 +398,7 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
config_commands = config.get("commands", []) config_commands = config.get("commands", [])
commands = {cmd["name"]: cmd for cmd in config_commands} commands = {cmd["name"]: cmd for cmd in config_commands}
if subcommand: if subcommand:
if subcommand not in commands: validate_subcommand(commands.keys(), subcommand)
msg.fail(f"Can't find command '{subcommand}' in project config", exits=1)
print(f"Usage: {COMMAND} project run {project_dir} {subcommand}") print(f"Usage: {COMMAND} project run {project_dir} {subcommand}")
help_text = commands[subcommand].get("help") help_text = commands[subcommand].get("help")
if help_text: if help_text:
@ -407,8 +426,7 @@ def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
config_commands = config.get("commands", []) config_commands = config.get("commands", [])
variables = config.get("variables", {}) variables = config.get("variables", {})
commands = {cmd["name"]: cmd for cmd in config_commands} commands = {cmd["name"]: cmd for cmd in config_commands}
if subcommand not in commands: validate_subcommand(commands.keys(), subcommand)
msg.fail(f"Can't find command '{subcommand}' in project config", exits=1)
if subcommand in config.get("run", []): if subcommand in config.get("run", []):
# This is one of the pipeline commands tracked in DVC # This is one of the pipeline commands tracked in DVC
dvc_cmd = ["dvc", "repro", subcommand, *dvc_args] dvc_cmd = ["dvc", "repro", subcommand, *dvc_args]
@ -454,10 +472,14 @@ def load_project_config(path: Path) -> Dict[str, Any]:
config_path = path / CONFIG_FILE config_path = path / CONFIG_FILE
if not config_path.exists(): if not config_path.exists():
msg.fail("Can't find project config", config_path, exits=1) msg.fail("Can't find project config", config_path, exits=1)
invalid_err = f"Invalid project config in {CONFIG_FILE}"
try:
config = srsly.read_yaml(config_path) config = srsly.read_yaml(config_path)
except ValueError as e:
msg.fail(invalid_err, e, exits=1)
errors = validate(ProjectConfigSchema, config) errors = validate(ProjectConfigSchema, config)
if errors: if errors:
msg.fail(f"Invalid project config in {CONFIG_FILE}", "\n".join(errors), exits=1) msg.fail(invalid_err, "\n".join(errors), exits=1)
return config return config
@ -496,8 +518,7 @@ def update_dvc_config(
# commands in project.yml and should be run in sequence # commands in project.yml and should be run in sequence
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
for name in config.get("run", []): for name in config.get("run", []):
if name not in config_commands: validate_subcommand(config_commands.keys(), name)
msg.fail(f"Can't find command '{name}' in project config", exits=1)
command = config_commands[name] command = config_commands[name]
deps = command.get("deps", []) deps = command.get("deps", [])
outputs = command.get("outputs", []) outputs = command.get("outputs", [])
@ -580,9 +601,9 @@ def run_commands(
command = command.format(**variables) command = command.format(**variables)
command = shlex.split(command, posix=not is_windows) command = shlex.split(command, posix=not is_windows)
# TODO: is this needed / a good idea? # TODO: is this needed / a good idea?
if len(command) and command[0] == "python": if len(command) and command[0] in ("python", "python3"):
command[0] = sys.executable command[0] = sys.executable
elif len(command) and command[0] == "pip": elif len(command) and command[0] in ("pip", "pip3"):
command = [sys.executable, "-m", "pip", *command[1:]] command = [sys.executable, "-m", "pip", *command[1:]]
if not silent: if not silent:
print(" ".join(command)) print(" ".join(command))
@ -640,6 +661,20 @@ def check_clone(name: str, dest: Path, repo: str) -> None:
) )
def validate_subcommand(commands: Sequence[str], subcommand: str) -> None:
    """Verify that the requested subcommand is one of the available commands.
    If it is not, the failure is reported through msg.fail (called with
    exits=1), and the message lists every available command.

    commands (Sequence[str]): The names of the available commands.
    subcommand (str): The name of the subcommand to look up.
    """
    # Guard clause: nothing to report when the subcommand is known.
    if subcommand in commands:
        return
    # NOTE(review): callers in this file pass dict.keys(), which supports the
    # membership test and iteration used here but is not strictly a Sequence.
    available = ", ".join(commands)
    msg.fail(
        f"Can't find command '{subcommand}' in {CONFIG_FILE}. "
        f"Available commands: {available}",
        exits=1,
    )
def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None: def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
"""Download a file using requests. """Download a file using requests.

View File

@ -729,12 +729,13 @@ cdef class ArcEager(TransitionSystem):
cdef ArcEagerGold gold_ = gold cdef ArcEagerGold gold_ = gold
gold_.update(stcls) gold_.update(stcls)
gold_state = gold_.c gold_state = gold_.c
n_gold = 0 cdef int n_gold = 0
for i in range(self.n_moves): for i in range(self.n_moves):
if self.c[i].is_valid(stcls.c, self.c[i].label): if self.c[i].is_valid(stcls.c, self.c[i].label):
is_valid[i] = True is_valid[i] = True
costs[i] = self.c[i].get_cost(stcls, &gold_state, self.c[i].label) costs[i] = self.c[i].get_cost(stcls, &gold_state, self.c[i].label)
n_gold += costs[i] <= 0 if costs[i] <= 0:
n_gold += 1
else: else:
is_valid[i] = False is_valid[i] = False
costs[i] = 9000 costs[i] = 9000