Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthw Honnibal 2020-07-01 01:03:39 +02:00
commit 35af5819e0
3 changed files with 76 additions and 37 deletions

View File

@ -4,7 +4,6 @@ import srsly
from pathlib import Path
from wasabi import msg
import subprocess
import shlex
import os
import re
import shutil
@ -16,7 +15,7 @@ from ._app import app, Arg, Opt, COMMAND, NAME
from .. import about
from ..schemas import ProjectConfigSchema, validate
from ..util import ensure_path, run_command, make_tempdir, working_dir
from ..util import get_hash, get_checksum
from ..util import get_hash, get_checksum, split_command
CONFIG_FILE = "project.yml"
@ -82,14 +81,18 @@ def project_clone_cli(
initializing DVC (Data Version Control). This allows DVC to integrate with
Git.
"""
if dest == Path.cwd():
dest = dest / name
project_clone(name, dest, repo=repo, git=git, no_init=no_init)
@project_cli.command("init")
def project_init_cli(
path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False),
# fmt: off
path: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
force: bool = Opt(False, "--force", "-F", help="Force initiziation"),
# fmt: on
):
"""Initialize a project directory with DVC and optionally Git. This should
typically be taken care of automatically when you run the "project clone"
@ -103,13 +106,13 @@ def project_init_cli(
@project_cli.command("assets")
def project_assets_cli(
# fmt: off
project_dir: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False),
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
# fmt: on
):
"""Use DVC (Data Version Control) to fetch project assets. Assets are
defined in the "assets" section of the project config. If possible, DVC
will try to track the files so you can pull changes from upstream. It will
also try and store the checksum so the assets are versioned. If th file
also try and store the checksum so the assets are versioned. If the file
can't be tracked or checked, it will be downloaded without DVC. If a checksum
is provided in the project config, the file is only downloaded if no local
file with the same checksum exists.
@ -124,7 +127,7 @@ def project_assets_cli(
def project_run_all_cli(
# fmt: off
ctx: typer.Context,
project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
# fmt: on
):
@ -148,8 +151,8 @@ def project_run_all_cli(
def project_run_cli(
# fmt: off
ctx: typer.Context,
project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
subcommand: str = Arg(None, help="Name of command defined in project config"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
# fmt: on
):
@ -172,8 +175,8 @@ def project_run_cli(
@project_cli.command("exec", hidden=True)
def project_exec_cli(
# fmt: off
project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
subcommand: str = Arg(..., help="Name of command defined in project config"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
# fmt: on
):
"""Execute a command defined in the project config. This CLI command is
@ -187,7 +190,7 @@ def project_exec_cli(
@project_cli.command("update-dvc")
def project_update_dvc_cli(
# fmt: off
project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
# fmt: on
@ -236,13 +239,17 @@ def project_clone(
# We're using Git and sparse checkout to only clone the files we need
with make_tempdir() as tmp_dir:
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
run_command(shlex.split(cmd))
try:
run_command(cmd)
except SystemExit:
err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'"
msg.fail(err)
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
f.write(name)
run_command(["git", "-C", tmp_dir, "fetch"])
run_command(["git", "-C", tmp_dir, "checkout"])
run_command(["git", "-C", str(tmp_dir), "fetch"])
run_command(["git", "-C", str(tmp_dir), "checkout"])
shutil.move(str(tmp_dir / Path(name).name), str(project_dir))
msg.good(f"Cloned project '{name}' from {repo}")
msg.good(f"Cloned project '{name}' from {repo} into {project_dir}")
for sub_dir in DIRS:
dir_path = project_dir / sub_dir
if not dir_path.exists():
@ -268,8 +275,7 @@ def project_init(
silent (bool): Don't print any output (via DVC).
analytics (bool): Opt-in to DVC analytics (defaults to False).
"""
project_dir = project_dir.resolve()
with working_dir(project_dir):
with working_dir(project_dir) as cwd:
if git:
run_command(["git", "init"])
init_cmd = ["dvc", "init"]
@ -288,11 +294,11 @@ def project_init(
# TODO: maybe we shouldn't do this, but it's otherwise super confusing
# once you commit your changes via Git and it creates a bunch of files
# that have no purpose
plots_dir = project_dir / DVC_DIR / "plots"
plots_dir = cwd / DVC_DIR / "plots"
if plots_dir.exists():
shutil.rmtree(str(plots_dir))
config = load_project_config(project_dir)
setup_check_dvc(project_dir, config)
config = load_project_config(cwd)
setup_check_dvc(cwd, config)
def project_assets(project_dir: Path) -> None:
@ -393,13 +399,13 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
commands = {cmd["name"]: cmd for cmd in config_commands}
if subcommand:
validate_subcommand(commands.keys(), subcommand)
print(f"Usage: {COMMAND} project run {project_dir} {subcommand}")
print(f"Usage: {COMMAND} project run {subcommand} {project_dir}")
help_text = commands[subcommand].get("help")
if help_text:
msg.text(f"\n{help_text}\n")
else:
print(f"\nAvailable commands in {CONFIG_FILE}")
print(f"Usage: {COMMAND} project run {project_dir} [COMMAND]")
print(f"Usage: {COMMAND} project run [COMMAND] {project_dir}")
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
msg.text("Run all commands defined in the 'run' block of the project config:")
print(f"{COMMAND} project run-all {project_dir}")
@ -500,7 +506,7 @@ def update_dvc_config(
path = path.resolve()
dvc_config_path = path / DVC_CONFIG
if dvc_config_path.exists():
# Cneck if the file was generated using the current config, if not, redo
# Check if the file was generated using the current config, if not, redo
with dvc_config_path.open("r", encoding="utf8") as f:
ref_hash = f.readline().strip().replace("# ", "")
if ref_hash == config_hash and not force:
@ -521,7 +527,7 @@ def update_dvc_config(
continue
# Default to "." as the project path since dvc.yaml is auto-generated
# and we don't want arbitrary paths in there
project_cmd = ["python", "-m", NAME, "project", "exec", ".", name]
project_cmd = ["python", "-m", NAME, "project", ".", "exec", name]
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
@ -584,23 +590,29 @@ def run_commands(
) -> None:
"""Run a sequence of commands in a subprocess, in order.
commands (List[str]): The split commands.
commands (List[str]): The string commands.
variables (Dict[str, str]): Dictionary of variable names, mapped to their
values. Will be used to substitute format string variables in the
commands.
silent (boll): Don't print the commands.
silent (bool): Don't print the commands.
"""
for command in commands:
# Substitute variables, e.g. "./{NAME}.json"
command = command.format(**variables)
command = shlex.split(command)
# TODO: is this needed / a good idea?
command = split_command(command)
# Not sure if this is needed or a good idea. Motivation: users may often
# use commands in their config that reference "python" and we want to
# make sure that it's always executing the same Python that spaCy is
# executed with and the pip in the same env, not some other Python/pip.
# Also ensures cross-compatibility if user 1 writes "python3" (because
# that's how it's set up on their system), and user 2 without the
# shortcut tries to re-run the command.
if len(command) and command[0] in ("python", "python3"):
command[0] = sys.executable
elif len(command) and command[0] in ("pip", "pip3"):
command = [sys.executable, "-m", "pip", *command[1:]]
if not silent:
print(" ".join(command))
print(f"Running command: {' '.join(command)}")
run_command(command)

View File

@ -132,6 +132,7 @@ class Warnings(object):
"are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.")
# TODO: fix numbering after merging develop into master
W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
W093 = ("Could not find any data to train the {name} on. Is your "
"input data correctly formatted ?")
@ -538,6 +539,7 @@ class Errors(object):
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
# TODO: fix numbering after merging develop into master
E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
"array and {doc_length} for the Doc itself.")
E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")

View File

@ -22,7 +22,7 @@ from contextlib import contextmanager
import tempfile
import shutil
import hashlib
import shlex
try:
import cupy.random
@ -35,7 +35,7 @@ except ImportError:
import importlib_metadata
from .symbols import ORTH
from .compat import cupy, CudaStream
from .compat import cupy, CudaStream, is_windows
from .errors import Errors, Warnings
from . import about
@ -434,12 +434,30 @@ def get_package_path(name):
return Path(pkg.__file__).parent
def run_command(command: List[str]) -> None:
"""Run a command on the command line as a subprocess.
def split_command(command: str) -> List[str]:
"""Split a string command using shlex. Handles platform compatibility.
command (list): The split command.
command (str) : The command to split
RETURNS (List[str]): The split command.
"""
return shlex.split(command, posix=not is_windows)
def run_command(command: Union[str, List[str]]) -> None:
"""Run a command on the command line as a subprocess. If the subprocess
returns a non-zero exit code, a system exit is performed.
command (str / List[str]): The command. If provided as a string, the
string will be split using shlex.split.
"""
if isinstance(command, str):
command = split_command(command)
try:
status = subprocess.call(command, env=os.environ.copy())
except FileNotFoundError:
raise FileNotFoundError(
Errors.E970.format(str_command=" ".join(command), tool=command[0])
)
if status != 0:
sys.exit(status)
@ -449,13 +467,17 @@ def working_dir(path: Union[str, Path]) -> None:
"""Change current working directory and returns to previous on exit.
path (str / Path): The directory to navigate to.
YIELDS (Path): The absolute path to the current working directory. This
should be used if the block needs to perform actions within the working
directory, to prevent mismatches with relative paths.
"""
prev_cwd = Path.cwd()
os.chdir(str(path))
current = Path(path).resolve()
os.chdir(str(current))
try:
yield
yield current
finally:
os.chdir(prev_cwd)
os.chdir(str(prev_cwd))
@contextmanager
@ -467,7 +489,10 @@ def make_tempdir():
"""
d = Path(tempfile.mkdtemp())
yield d
try:
shutil.rmtree(str(d))
except PermissionError as e:
warnings.warn(Warnings.W091.format(dir=d, msg=e))
def get_hash(data) -> str: