Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthw Honnibal 2020-07-01 01:03:39 +02:00
commit 35af5819e0
3 changed files with 76 additions and 37 deletions

View File

@ -4,7 +4,6 @@ import srsly
from pathlib import Path from pathlib import Path
from wasabi import msg from wasabi import msg
import subprocess import subprocess
import shlex
import os import os
import re import re
import shutil import shutil
@ -16,7 +15,7 @@ from ._app import app, Arg, Opt, COMMAND, NAME
from .. import about from .. import about
from ..schemas import ProjectConfigSchema, validate from ..schemas import ProjectConfigSchema, validate
from ..util import ensure_path, run_command, make_tempdir, working_dir from ..util import ensure_path, run_command, make_tempdir, working_dir
from ..util import get_hash, get_checksum from ..util import get_hash, get_checksum, split_command
CONFIG_FILE = "project.yml" CONFIG_FILE = "project.yml"
@ -82,14 +81,18 @@ def project_clone_cli(
initializing DVC (Data Version Control). This allows DVC to integrate with initializing DVC (Data Version Control). This allows DVC to integrate with
Git. Git.
""" """
if dest == Path.cwd():
dest = dest / name
project_clone(name, dest, repo=repo, git=git, no_init=no_init) project_clone(name, dest, repo=repo, git=git, no_init=no_init)
@project_cli.command("init") @project_cli.command("init")
def project_init_cli( def project_init_cli(
path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False), # fmt: off
path: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"), git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
force: bool = Opt(False, "--force", "-F", help="Force initiziation"), force: bool = Opt(False, "--force", "-F", help="Force initiziation"),
# fmt: on
): ):
"""Initialize a project directory with DVC and optionally Git. This should """Initialize a project directory with DVC and optionally Git. This should
typically be taken care of automatically when you run the "project clone" typically be taken care of automatically when you run the "project clone"
@ -103,13 +106,13 @@ def project_init_cli(
@project_cli.command("assets") @project_cli.command("assets")
def project_assets_cli( def project_assets_cli(
# fmt: off # fmt: off
project_dir: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False), project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
# fmt: on # fmt: on
): ):
"""Use DVC (Data Version Control) to fetch project assets. Assets are """Use DVC (Data Version Control) to fetch project assets. Assets are
defined in the "assets" section of the project config. If possible, DVC defined in the "assets" section of the project config. If possible, DVC
will try to track the files so you can pull changes from upstream. It will will try to track the files so you can pull changes from upstream. It will
also try and store the checksum so the assets are versioned. If th file also try and store the checksum so the assets are versioned. If the file
can't be tracked or checked, it will be downloaded without DVC. If a checksum can't be tracked or checked, it will be downloaded without DVC. If a checksum
is provided in the project config, the file is only downloaded if no local is provided in the project config, the file is only downloaded if no local
file with the same checksum exists. file with the same checksum exists.
@ -124,7 +127,7 @@ def project_assets_cli(
def project_run_all_cli( def project_run_all_cli(
# fmt: off # fmt: off
ctx: typer.Context, ctx: typer.Context,
project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
# fmt: on # fmt: on
): ):
@ -148,8 +151,8 @@ def project_run_all_cli(
def project_run_cli( def project_run_cli(
# fmt: off # fmt: off
ctx: typer.Context, ctx: typer.Context,
project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
subcommand: str = Arg(None, help="Name of command defined in project config"), subcommand: str = Arg(None, help="Name of command defined in project config"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands") show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
# fmt: on # fmt: on
): ):
@ -172,8 +175,8 @@ def project_run_cli(
@project_cli.command("exec", hidden=True) @project_cli.command("exec", hidden=True)
def project_exec_cli( def project_exec_cli(
# fmt: off # fmt: off
project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
subcommand: str = Arg(..., help="Name of command defined in project config"), subcommand: str = Arg(..., help="Name of command defined in project config"),
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
# fmt: on # fmt: on
): ):
"""Execute a command defined in the project config. This CLI command is """Execute a command defined in the project config. This CLI command is
@ -187,7 +190,7 @@ def project_exec_cli(
@project_cli.command("update-dvc") @project_cli.command("update-dvc")
def project_update_dvc_cli( def project_update_dvc_cli(
# fmt: off # fmt: off
project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False), project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"), verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"), force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
# fmt: on # fmt: on
@ -236,13 +239,17 @@ def project_clone(
# We're using Git and sparse checkout to only clone the files we need # We're using Git and sparse checkout to only clone the files we need
with make_tempdir() as tmp_dir: with make_tempdir() as tmp_dir:
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true" cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
run_command(shlex.split(cmd)) try:
run_command(cmd)
except SystemExit:
err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'"
msg.fail(err)
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f: with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
f.write(name) f.write(name)
run_command(["git", "-C", tmp_dir, "fetch"]) run_command(["git", "-C", str(tmp_dir), "fetch"])
run_command(["git", "-C", tmp_dir, "checkout"]) run_command(["git", "-C", str(tmp_dir), "checkout"])
shutil.move(str(tmp_dir / Path(name).name), str(project_dir)) shutil.move(str(tmp_dir / Path(name).name), str(project_dir))
msg.good(f"Cloned project '{name}' from {repo}") msg.good(f"Cloned project '{name}' from {repo} into {project_dir}")
for sub_dir in DIRS: for sub_dir in DIRS:
dir_path = project_dir / sub_dir dir_path = project_dir / sub_dir
if not dir_path.exists(): if not dir_path.exists():
@ -268,8 +275,7 @@ def project_init(
silent (bool): Don't print any output (via DVC). silent (bool): Don't print any output (via DVC).
analytics (bool): Opt-in to DVC analytics (defaults to False). analytics (bool): Opt-in to DVC analytics (defaults to False).
""" """
project_dir = project_dir.resolve() with working_dir(project_dir) as cwd:
with working_dir(project_dir):
if git: if git:
run_command(["git", "init"]) run_command(["git", "init"])
init_cmd = ["dvc", "init"] init_cmd = ["dvc", "init"]
@ -288,11 +294,11 @@ def project_init(
# TODO: maybe we shouldn't do this, but it's otherwise super confusing # TODO: maybe we shouldn't do this, but it's otherwise super confusing
# once you commit your changes via Git and it creates a bunch of files # once you commit your changes via Git and it creates a bunch of files
# that have no purpose # that have no purpose
plots_dir = project_dir / DVC_DIR / "plots" plots_dir = cwd / DVC_DIR / "plots"
if plots_dir.exists(): if plots_dir.exists():
shutil.rmtree(str(plots_dir)) shutil.rmtree(str(plots_dir))
config = load_project_config(project_dir) config = load_project_config(cwd)
setup_check_dvc(project_dir, config) setup_check_dvc(cwd, config)
def project_assets(project_dir: Path) -> None: def project_assets(project_dir: Path) -> None:
@ -393,13 +399,13 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
commands = {cmd["name"]: cmd for cmd in config_commands} commands = {cmd["name"]: cmd for cmd in config_commands}
if subcommand: if subcommand:
validate_subcommand(commands.keys(), subcommand) validate_subcommand(commands.keys(), subcommand)
print(f"Usage: {COMMAND} project run {project_dir} {subcommand}") print(f"Usage: {COMMAND} project run {subcommand} {project_dir}")
help_text = commands[subcommand].get("help") help_text = commands[subcommand].get("help")
if help_text: if help_text:
msg.text(f"\n{help_text}\n") msg.text(f"\n{help_text}\n")
else: else:
print(f"\nAvailable commands in {CONFIG_FILE}") print(f"\nAvailable commands in {CONFIG_FILE}")
print(f"Usage: {COMMAND} project run {project_dir} [COMMAND]") print(f"Usage: {COMMAND} project run [COMMAND] {project_dir}")
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
msg.text("Run all commands defined in the 'run' block of the project config:") msg.text("Run all commands defined in the 'run' block of the project config:")
print(f"{COMMAND} project run-all {project_dir}") print(f"{COMMAND} project run-all {project_dir}")
@ -500,7 +506,7 @@ def update_dvc_config(
path = path.resolve() path = path.resolve()
dvc_config_path = path / DVC_CONFIG dvc_config_path = path / DVC_CONFIG
if dvc_config_path.exists(): if dvc_config_path.exists():
# Cneck if the file was generated using the current config, if not, redo # Check if the file was generated using the current config, if not, redo
with dvc_config_path.open("r", encoding="utf8") as f: with dvc_config_path.open("r", encoding="utf8") as f:
ref_hash = f.readline().strip().replace("# ", "") ref_hash = f.readline().strip().replace("# ", "")
if ref_hash == config_hash and not force: if ref_hash == config_hash and not force:
@ -521,7 +527,7 @@ def update_dvc_config(
continue continue
# Default to "." as the project path since dvc.yaml is auto-generated # Default to "." as the project path since dvc.yaml is auto-generated
# and we don't want arbitrary paths in there # and we don't want arbitrary paths in there
project_cmd = ["python", "-m", NAME, "project", "exec", ".", name] project_cmd = ["python", "-m", NAME, "project", ".", "exec", name]
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl] deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl] outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl] outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
@ -584,23 +590,29 @@ def run_commands(
) -> None: ) -> None:
"""Run a sequence of commands in a subprocess, in order. """Run a sequence of commands in a subprocess, in order.
commands (List[str]): The split commands. commands (List[str]): The string commands.
variables (Dict[str, str]): Dictionary of variable names, mapped to their variables (Dict[str, str]): Dictionary of variable names, mapped to their
values. Will be used to substitute format string variables in the values. Will be used to substitute format string variables in the
commands. commands.
silent (boll): Don't print the commands. silent (bool): Don't print the commands.
""" """
for command in commands: for command in commands:
# Substitute variables, e.g. "./{NAME}.json" # Substitute variables, e.g. "./{NAME}.json"
command = command.format(**variables) command = command.format(**variables)
command = shlex.split(command) command = split_command(command)
# TODO: is this needed / a good idea? # Not sure if this is needed or a good idea. Motivation: users may often
# use commands in their config that reference "python" and we want to
# make sure that it's always executing the same Python that spaCy is
# executed with and the pip in the same env, not some other Python/pip.
# Also ensures cross-compatibility if user 1 writes "python3" (because
# that's how it's set up on their system), and user 2 without the
# shortcut tries to re-run the command.
if len(command) and command[0] in ("python", "python3"): if len(command) and command[0] in ("python", "python3"):
command[0] = sys.executable command[0] = sys.executable
elif len(command) and command[0] in ("pip", "pip3"): elif len(command) and command[0] in ("pip", "pip3"):
command = [sys.executable, "-m", "pip", *command[1:]] command = [sys.executable, "-m", "pip", *command[1:]]
if not silent: if not silent:
print(" ".join(command)) print(f"Running command: {' '.join(command)}")
run_command(command) run_command(command)

View File

@ -132,6 +132,7 @@ class Warnings(object):
"are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.") "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.")
# TODO: fix numbering after merging develop into master # TODO: fix numbering after merging develop into master
W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.") W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
W093 = ("Could not find any data to train the {name} on. Is your " W093 = ("Could not find any data to train the {name} on. Is your "
"input data correctly formatted ?") "input data correctly formatted ?")
@ -538,6 +539,7 @@ class Errors(object):
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
# TODO: fix numbering after merging develop into master # TODO: fix numbering after merging develop into master
E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the " E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
"array and {doc_length} for the Doc itself.") "array and {doc_length} for the Doc itself.")
E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.") E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")

View File

@ -22,7 +22,7 @@ from contextlib import contextmanager
import tempfile import tempfile
import shutil import shutil
import hashlib import hashlib
import shlex
try: try:
import cupy.random import cupy.random
@ -35,7 +35,7 @@ except ImportError:
import importlib_metadata import importlib_metadata
from .symbols import ORTH from .symbols import ORTH
from .compat import cupy, CudaStream from .compat import cupy, CudaStream, is_windows
from .errors import Errors, Warnings from .errors import Errors, Warnings
from . import about from . import about
@ -434,12 +434,30 @@ def get_package_path(name):
return Path(pkg.__file__).parent return Path(pkg.__file__).parent
def run_command(command: List[str]) -> None: def split_command(command: str) -> List[str]:
"""Run a command on the command line as a subprocess. """Split a string command using shlex. Handles platform compatibility.
command (list): The split command. command (str) : The command to split
RETURNS (List[str]): The split command.
""" """
status = subprocess.call(command, env=os.environ.copy()) return shlex.split(command, posix=not is_windows)
def run_command(command: Union[str, List[str]]) -> None:
"""Run a command on the command line as a subprocess. If the subprocess
returns a non-zero exit code, a system exit is performed.
command (str / List[str]): The command. If provided as a string, the
string will be split using shlex.split.
"""
if isinstance(command, str):
command = split_command(command)
try:
status = subprocess.call(command, env=os.environ.copy())
except FileNotFoundError:
raise FileNotFoundError(
Errors.E970.format(str_command=" ".join(command), tool=command[0])
)
if status != 0: if status != 0:
sys.exit(status) sys.exit(status)
@ -449,13 +467,17 @@ def working_dir(path: Union[str, Path]) -> None:
"""Change current working directory and returns to previous on exit. """Change current working directory and returns to previous on exit.
path (str / Path): The directory to navigate to. path (str / Path): The directory to navigate to.
YIELDS (Path): The absolute path to the current working directory. This
should be used if the block needs to perform actions within the working
directory, to prevent mismatches with relative paths.
""" """
prev_cwd = Path.cwd() prev_cwd = Path.cwd()
os.chdir(str(path)) current = Path(path).resolve()
os.chdir(str(current))
try: try:
yield yield current
finally: finally:
os.chdir(prev_cwd) os.chdir(str(prev_cwd))
@contextmanager @contextmanager
@ -467,7 +489,10 @@ def make_tempdir():
""" """
d = Path(tempfile.mkdtemp()) d = Path(tempfile.mkdtemp())
yield d yield d
shutil.rmtree(str(d)) try:
shutil.rmtree(str(d))
except PermissionError as e:
warnings.warn(Warnings.W091.format(dir=d, msg=e))
def get_hash(data) -> str: def get_hash(data) -> str: