Tidy up, document and add custom clone logic

This commit is contained in:
Ines Montani 2020-06-28 15:08:35 +02:00
parent dc7a9be9f8
commit 2f6ee0d018
3 changed files with 352 additions and 222 deletions

View File

@ -1,62 +0,0 @@
import tempfile
import typer
from pathlib import Path
import subprocess
import shlex
import shutil
from contextlib import contextmanager
@contextmanager
def make_tempdir():
    """Create a temporary directory and remove it once the with block ends.

    The directory and its contents are removed even if the body of the with
    block raises, so a failed run does not leave stray directories behind.

    YIELDS (Path): The path of the temporary directory.
    """
    d = Path(tempfile.mkdtemp())
    try:
        yield d
    finally:
        # Always clean up, no matter how the with block was left.
        shutil.rmtree(str(d))
def clone_repo(repo, temp_dir):
    """Clone a Git repo at depth 1 into a directory without checking out files.

    repo: The repository to clone (passed to "git clone" as-is).
    temp_dir: The directory to clone into.
    """
    # Sparse checkout is switched on in the clone's config; no files are
    # checked out at this point.
    clone_options = [
        "--no-checkout",
        "--depth", "1",
        "--config", "core.sparseCheckout=true",
    ]
    subprocess.check_call(["git", "clone", repo, temp_dir, *clone_options])
def checkout_and_fetch(temp_dir):
    """Run "git fetch" and then "git checkout" inside the given repository.

    temp_dir: Path of the local Git repository to operate on.
    """
    # check_call raises on a non-zero exit, so checkout only runs if the
    # fetch succeeded.
    for git_action in ("fetch", "checkout"):
        subprocess.check_call(["git", "-C", temp_dir, git_action])
def set_sparse_checkout_dir(temp_dir, subpath):
    """Write the given subpath to the repo's .git/info/sparse-checkout file.

    temp_dir (Path): Root directory of the cloned repository.
    subpath (str): The text to write, replacing any previous file content.
    """
    sparse_file = temp_dir / ".git" / "info" / "sparse-checkout"
    with sparse_file.open("w") as handle:
        handle.write(subpath)
def main(repo: str, subpath: str, dest: Path):
    """Copy a single subdirectory of a Git repo into a destination directory.

    The repo is cloned into a temporary directory using sparse checkout, the
    requested subdirectory is copied to dest / subpath, and the temporary
    clone is removed again.

    repo (str): The Git repository to clone from.
    subpath (str): Path of the subdirectory within the repo.
    dest (Path): Directory that receives the copy (at dest / subpath).
    RAISES (FileNotFoundError): If subpath doesn't exist after the checkout.
    """
    with make_tempdir() as temp_dir:
        clone_repo(repo, temp_dir)
        print("After clone", list(temp_dir.iterdir()))
        set_sparse_checkout_dir(temp_dir, subpath)
        checkout_and_fetch(temp_dir)
        print("After checkout", list(temp_dir.iterdir()))
        source_dir = temp_dir / subpath
        # Test the path itself instead of asserting membership in iterdir():
        # iterdir() only lists top-level entries, so nested subpaths such as
        # "a/b" could never match, and asserts are stripped under "python -O".
        if not source_dir.exists():
            raise FileNotFoundError(
                f"Can't find '{subpath}' in {repo} after sparse checkout"
            )
        shutil.copytree(source_dir, dest / subpath, dirs_exist_ok=True)
    print("Exists after cleanup?", temp_dir.exists())
    print("Destination", list(dest.iterdir()))
# Only build and run the command-line interface when this file is executed
# as a script; main() is handed to typer.run.
if __name__ == "__main__":
    typer.run(main)

View File

@ -9,12 +9,12 @@ import os
import re
import shutil
import sys
import hashlib
from ._app import app, Arg, Opt, COMMAND, NAME
from .. import about
from ..schemas import ProjectConfigSchema, validate
from ..util import ensure_path, run_command, make_tempdir, working_dir
from ..util import get_hash, get_checksum
CONFIG_FILE = "project.yml"
@ -45,19 +45,13 @@ project_cli = typer.Typer(help="Command-line interface for spaCy projects")
@project_cli.callback(invoke_without_command=True)
def callback(ctx: typer.Context):
"""This runs before every project command and ensures DVC is installed and
everything is up to date.
"""
try:
subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
except Exception:
msg.fail(
"spaCy projects require DVC (Data Version Control) and the 'dvc' command",
"You can install the Python package from pip (pip install dvc) or "
"conda (conda install -c conda-forge dvc). For more details, see the "
"documentation: https://dvc.org/doc/install",
exits=1,
)
"""This runs before every project command and ensures DVC is installed."""
ensure_dvc()
################
# CLI COMMANDS #
################
@project_cli.command("clone")
@ -68,13 +62,144 @@ def project_clone_cli(
repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
no_init: bool = Opt(False, "--no-init", "-NI", help="Don't initialize the project with DVC"),
verbose: bool = Opt(False, "--verbose", "-V", help="Show detailed information")
# fmt: on
):
"""Clone a project template from a repository."""
project_clone(
name, dest, repo=repo, git=git, no_init=no_init, verbose=verbose, silent=True
)
"""Clone a project template from a repository. Calls into "git" and will
only download the files from the given subdirectory. The GitHub repo
defaults to the official spaCy template repo, but can be customized
(including using a private repo). Setting the --git flag will also
initialize the project directory as a Git repo. If the project is intended
to be a Git repo, it should be initialized with Git first, before
initializing DVC (Data Version Control). This allows DVC to integrate with
Git.
"""
project_clone(name, dest, repo=repo, git=git, no_init=no_init)
@project_cli.command("init")
def project_init_cli(
    path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False),
    git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
) -> None:
    """Initialize a project directory with DVC and optionally Git. This should
    typically be taken care of automatically when you run the "project clone"
    command, but you can also run it separately. If the project is intended to
    be a Git repo, it should be initialized with Git first, before initializing
    DVC. This allows DVC to integrate with Git.
    """
    # Thin CLI wrapper: forwards the parsed arguments to project_init and
    # always passes silent=True.
    project_init(path, git=git, silent=True)
@project_cli.command("assets")
def project_assets_cli(
    # fmt: off
    project_dir: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False),
    # fmt: on
):
    """Use DVC (Data Version Control) to fetch the assets for the project,
    defined in the "assets" section of the project config. If possible, DVC
    will try to track the files so you can pull changes from upstream. It will
    also try and store the checksum so the assets are versioned. If the file
    can't be tracked or checked, it will be downloaded using curl. If a checksum
    is provided in the project config, the file is only downloaded if no local
    file with the same checksum exists.
    """
    # Thin CLI wrapper around project_assets.
    project_assets(project_dir)
@project_cli.command(
    "run-all",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_run_all_cli(
    # fmt: off
    ctx: typer.Context,
    project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
):
    """Run all commands defined in the project. This command will use DVC and
    the defined outputs and dependencies in the project config to determine
    which steps need to be re-run and where to start. This means you're only
    re-generating data if the inputs have changed.
    This command calls into "dvc repro" and all additional arguments are passed
    to the "dvc repro" command: https://dvc.org/doc/command-reference/repro
    """
    # Without --help, run the project and forward ctx.args unchanged.
    if not show_help:
        project_run_all(project_dir, *ctx.args)
        return
    # --help is handled here so the project-specific help can be printed.
    print_run_help(project_dir)
@project_cli.command(
    "run",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_run_cli(
    # fmt: off
    ctx: typer.Context,
    project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
    subcommand: str = Arg(None, help="Name of command defined in project config"),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
):
    """Run a named script defined in the project config. If the command is
    part of the default pipeline defined in the "run" section, DVC is used to
    determine whether the step should re-run if its inputs have changed, or
    whether everything is up to date. If the script is not part of the default
    pipeline, it will be called separately without DVC.
    If DVC is used, the command calls into "dvc repro" and all additional
    arguments are passed to the "dvc repro" command:
    https://dvc.org/doc/command-reference/repro
    """
    # Help is shown on request and also when no subcommand was given.
    wants_help = show_help or not subcommand
    if wants_help:
        print_run_help(project_dir, subcommand)
        return
    project_run(project_dir, subcommand, *ctx.args)
@project_cli.command("exec", hidden=True)
def project_exec_cli(
    # fmt: off
    project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
    subcommand: str = Arg(..., help="Name of command defined in project config"),
    # fmt: on
) -> None:
    """Execute a command defined in the project config. This CLI command is
    only called internally in auto-generated DVC pipelines, as a shortcut for
    multi-step commands in the project config. You typically shouldn't have to
    call it yourself. To run a command, call "run" or "run-all".
    """
    # Thin CLI wrapper around project_exec; the command is registered with
    # hidden=True.
    project_exec(project_dir, subcommand)
@project_cli.command("update-dvc")
def project_update_dvc_cli(
    # fmt: off
    project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
    # fmt: on
):
    """Update the auto-generated DVC config file. Uses the steps defined in the
    "run" section of the project config. This typically happens automatically
    when running a command, but can also be triggered manually if needed.
    """
    project_config = load_project_config(project_dir)
    was_updated = update_dvc_config(
        project_dir, project_config, verbose=verbose, force=force
    )
    # Report the outcome either way so the user knows whether anything changed.
    if not was_updated:
        msg.info(f"No changes found in {CONFIG_FILE}, no update needed")
        return
    msg.good(f"Updated DVC config from {CONFIG_FILE}")
# Register the project command group on the main app under the name "project".
app.add_typer(project_cli, name="project")
#################
# CLI FUNCTIONS #
#################
def project_clone(
@ -84,51 +209,55 @@ def project_clone(
repo: str = about.__projects__,
git: bool = False,
no_init: bool = False,
silent: bool = False,
verbose: bool = False,
) -> None:
"""Clone a project template from a repository.
name (str): Name of subdirectory to clone.
dest (Path): Destination path of cloned project.
repo (str): URL of Git repo containing project templates.
git (bool): Initialize project as Git repo. Should be set to True if project
is intended as a repo, since it will allow DVC to integrate with Git.
no_init (bool): Don't initialize DVC and Git automatically. If True, the
"init" command or "git init" and "dvc init" need to be run manually.
"""
dest = ensure_path(dest)
check_clone_dest(dest)
# When cloning a subdirectory with DVC, it will create a folder of that name
# within the destination dir, so we use a tempdir and then copy it into the
# parent directory to create the cloned directory
dest = dest.resolve()
check_clone(name, dest, repo)
project_dir = dest.resolve()
# We're using Git and sparse checkout to only clone the files we need
with make_tempdir() as tmp_dir:
cmd = ["dvc", "get", repo, name, "-o", str(tmp_dir)]
if verbose:
cmd.append("--verbose")
if silent:
cmd.append("--quiet")
print(" ".join(cmd))
run_command(cmd)
shutil.move(str(tmp_dir / Path(name).name), str(dest))
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
run_command(shlex.split(cmd))
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
f.write(name)
run_command(["git", "-C", tmp_dir, "fetch"])
run_command(["git", "-C", tmp_dir, "checkout"])
shutil.move(str(tmp_dir / Path(name).name), str(project_dir))
msg.good(f"Cloned project '{name}' from {repo}")
for sub_dir in DIRS:
dir_path = dest / sub_dir
dir_path = project_dir / sub_dir
if not dir_path.exists():
dir_path.mkdir(parents=True)
if not no_init:
project_init(dest, git=git, silent=silent)
project_init(project_dir, git=git, silent=True)
msg.good(f"Your project is now ready!", dest)
print(f"To fetch the assets, run:\npython -m {NAME} project assets {dest}")
@project_cli.command("init")
def project_init_cli(
    path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False),
    git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
) -> None:
    """Initialize a project directory with DVC and Git (optional). This should
    typically be taken care of automatically when you run the "project clone"
    command.
    """
    # Forwards the parsed CLI arguments to project_init with silent=True.
    project_init(path, git=git, silent=True)
print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
def project_init(
dest: Path, *, git: bool = False, silent: bool = False, analytics: bool = False
project_dir: Path,
*,
git: bool = False,
silent: bool = False,
analytics: bool = False,
):
with working_dir(dest):
"""Initialize a project as a DVC and (optionally) as a Git repo.
project_dir (Path): Path to project directory.
git (bool): Also call "git init" to initialize directory as a Git repo.
silent (bool): Don't print any output (via DVC).
analytics (bool): Opt-in to DVC analytics (defaults to False).
"""
with working_dir(project_dir):
init_cmd = ["dvc", "init"]
if silent:
init_cmd.append("--quiet")
@ -137,25 +266,20 @@ def project_init(
if git:
run_command(["git", "init"])
run_command(init_cmd)
# We don't want to have analytics on by default - our users should
# opt-in explicitly. If they want it, they can always enable it.
if not analytics:
# TODO: find a better solution for this?
run_command(["dvc", "config", "core.analytics", "false"])
config = load_project_config(dest)
setup_check_dvc(dest, config)
config = load_project_config(project_dir)
setup_check_dvc(project_dir, config)
@project_cli.command("assets")
def project_assets_cli(
    # fmt: off
    path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False),
    # fmt: on
) -> None:
    """Use Data Version Control to get the assets for the project."""
    # Forwards the project path to project_assets.
    project_assets(path)
def project_assets(project_dir: Path) -> None:
"""Fetch assets for a project using DVC if possible.
def project_assets(project_path: Path) -> None:
project_path = ensure_path(project_path)
project_dir (Path): Path to project directory.
"""
project_path = ensure_path(project_dir)
config = load_project_config(project_path)
setup_check_dvc(project_path, config)
assets = config.get("assets", {})
@ -172,7 +296,17 @@ def project_assets(project_path: Path) -> None:
def fetch_asset(
project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
) -> None:
check_asset(url)
"""Fetch an asset from a given URL or path. Will try to import the file
using DVC's import-url if possible (fully tracked and versioned) and falls
back to get-url (versioned) and a non-DVC download if necessary. If a
checksum is provided and a local file exists, it's only re-downloaded if the
checksum doesn't match.
project_path (Path): Path to project directory.
url (str): URL or path to asset.
checksum (Optional[str]): Optional expected checksum of local file.
"""
url = convert_asset_url(url)
dest_path = (project_path / dest).resolve()
if dest_path.exists() and checksum:
# If there's already a file, check for checksum
@ -185,12 +319,13 @@ def fetch_asset(
# If these fail, we don't want to output an error or info message.
# Try with tracking the source first, then just downloading with
# DVC, then a regular non-DVC download.
dvc_cmd = ["dvc", "import-url", url, str(dest_path)]
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
except subprocess.CalledProcessError:
dvc_cmd = ["dvc", "get-url", url, str(dest_path)]
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
run_command(["dvc", "add", str(dest_path)])
try:
dvc_cmd = ["dvc", "import-url", url, str(dest_path)]
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
except subprocess.CalledProcessError:
dvc_cmd = ["dvc", "get-url", url, str(dest_path)]
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
run_command(["dvc", "add", str(dest_path)])
except subprocess.CalledProcessError:
# TODO: replace curl
run_command(["curl", url, "--output", str(dest_path), "--progress-bar"])
@ -200,25 +335,12 @@ def fetch_asset(
msg.good(f"Fetched asset {dest}")
@project_cli.command(
    "run-all",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_run_all_cli(
    # fmt: off
    ctx: typer.Context,
    project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
):
    """Run all commands. Additional arguments are passed to dvc repro."""
    # Show the project-specific help if requested, then stop.
    if show_help:
        print_run_help(project_dir)
        return
    # Otherwise forward the extra arguments in ctx.args unchanged.
    project_run_all(project_dir, *ctx.args)
def project_run_all(project_dir: Path, *dvc_args) -> None:
"""Run all commands defined in the project using DVC.
project_dir (Path): Path to project directory.
*dvc_args: Other arguments passed to "dvc repro".
"""
config = load_project_config(project_dir)
setup_check_dvc(project_dir, config)
dvc_cmd = ["dvc", "repro", *dvc_args]
@ -226,27 +348,16 @@ def project_run_all(project_dir: Path, *dvc_args) -> None:
run_command(dvc_cmd)
@project_cli.command(
    "run",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_run_cli(
    # fmt: off
    ctx: typer.Context,
    project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
    subcommand: str = Arg(None, help="Name of command defined in project config"),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
):
    """Run scripts defined in the project."""
    # Only run when a subcommand was given and help was not requested.
    if subcommand and not show_help:
        project_run(project_dir, subcommand, *ctx.args)
    else:
        print_run_help(project_dir, subcommand)
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
"""Simulate a CLI help prompt using the info available in the project config."""
"""Simulate a CLI help prompt using the info available in the project config.
project_dir (Path): The project directory.
subcommand (Optional[str]): The subcommand or None. If a subcommand is
provided, the subcommand help is shown. Otherwise, the top-level help
and a list of available commands is printed.
"""
config = load_project_config(project_dir)
setup_check_dvc(project_dir, config)
config_commands = config.get("commands", [])
commands = {cmd["name"]: cmd for cmd in config_commands}
if subcommand:
@ -260,9 +371,20 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
print(f"\nAvailable commands in {CONFIG_FILE}")
print(f"Usage: {COMMAND} project run {project_dir} [COMMAND]")
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
msg.text("Run all commands defined in the 'run' block of the project config:")
print(f"{COMMAND} project run-all {project_dir}")
def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
"""Run a named script defined in the project config. If the script is part
of the default pipeline (defined in the "run" section), DVC is used to
execute the command, so it can determine whether to rerun it. It then
calls into "exec" to execute it.
project_dir (Path): Path to project directory.
subcommand (str): Name of command to run.
*dvc_args: Other arguments passed to "dvc repro".
"""
config = load_project_config(project_dir)
setup_check_dvc(project_dir, config)
config_commands = config.get("commands", [])
@ -286,18 +408,12 @@ def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
run_commands(cmd["script"], variables)
@project_cli.command("exec")
def project_exec_cli(
    # fmt: off
    project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
    subcommand: str = Arg(..., help="Name of command defined in project config"),
    # fmt: on
) -> None:
    """Internals"""
    # Forwards the project directory and command name to project_exec.
    project_exec(project_dir, subcommand)
def project_exec(project_dir: Path, subcommand: str):
"""Execute a command defined in the project config.
project_dir (Path): Path to project directory.
subcommand (str): Name of command to run.
"""
config = load_project_config(project_dir)
config_commands = config.get("commands", [])
variables = config.get("variables", {})
@ -306,26 +422,17 @@ def project_exec(project_dir: Path, subcommand: str):
run_commands(commands[subcommand]["script"], variables)
@project_cli.command("update-dvc")
def project_update_dvc_cli(
    # fmt: off
    project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
    # fmt: on
):
    # Load the project config, regenerate the DVC config from it and report
    # whether update_dvc_config signalled a change.
    project_config = load_project_config(project_dir)
    if update_dvc_config(project_dir, project_config, verbose=verbose, force=force):
        msg.good(f"Updated DVC config from {CONFIG_FILE}")
    else:
        msg.info(f"No changes found in {CONFIG_FILE}, no update needed")
app.add_typer(project_cli, name="project")
###########
# HELPERS #
###########
def load_project_config(path: Path) -> Dict[str, Any]:
"""Load the project config file from a directory and validate it.
path (Path): The path to the project directory.
RETURNS (Dict[str, Any]): The loaded project config.
"""
config_path = path / CONFIG_FILE
if not config_path.exists():
msg.fail("Can't find project config", config_path, exits=1)
@ -343,8 +450,17 @@ def update_dvc_config(
silent: bool = False,
force: bool = False,
) -> bool:
"""Re-run the DVC commands in dry mode and update dvc.yml file in the
project directory. The file is auto-generated based on the config.
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
project directory. The file is auto-generated based on the config. The
first line of the auto-generated file specifies the hash of the config
dict, so if any of the config values change, the DVC config is regenerated.
path (Path): The path to the project directory.
config (Dict[str, Any]): The loaded project config.
verbose (bool): Whether to print additional info (via DVC).
silent (bool): Don't output anything (via DVC).
force (bool): Force update, even if hashes match.
RETURNS (bool): Whether the DVC config file was updated.
"""
config_hash = get_hash(config)
path = path.resolve()
@ -392,11 +508,40 @@ def update_dvc_config(
return True
def setup_check_dvc(project_path: Path, config: Dict[str, Any]) -> None:
if not (project_path / ".dvc").exists():
msg.fail("Project not initialized as a DVC project", exits=1)
def ensure_dvc() -> None:
    """Check that the "dvc" command can be launched and fail with install
    instructions if it can't. Only an exception while starting the command is
    treated as a failure; the command's exit status is not inspected.
    """
    version_cmd = ["dvc", "--version"]
    try:
        # Discard the version output, we only care that the process starts.
        subprocess.run(version_cmd, stdout=subprocess.DEVNULL)
    except Exception:
        title = (
            "spaCy projects require DVC (Data Version Control) and the 'dvc' command"
        )
        install_hint = (
            "You can install the Python package from pip (pip install dvc) or "
            "conda (conda install -c conda-forge dvc). For more details, see the "
            "documentation: https://dvc.org/doc/install"
        )
        msg.fail(title, install_hint, exits=1)
def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None:
"""Check that the project is set up correctly with DVC and update its
config if needed. Will raise an error if the project is not an initialized
DVC project.
project_dir (Path): The path to the project directory.
config (Dict[str, Any]): The loaded project config.
"""
if not project_dir.exists():
msg.fail(f"Can't find project directory: {project_dir}")
if not (project_dir / ".dvc").exists():
msg.fail(
"Project not initialized as a DVC project.",
f"Make sure that the project template was cloned correctly. To "
f"initialize the project directory manually, you can run: "
f"{COMMAND} project init {project_dir}",
exits=1,
)
with msg.loading("Updating DVC config..."):
updated = update_dvc_config(project_path, config, silent=True)
updated = update_dvc_config(project_dir, config, silent=True)
if updated:
msg.good(f"Updated DVC config from changed {CONFIG_FILE}")
@ -404,6 +549,14 @@ def setup_check_dvc(project_path: Path, config: Dict[str, Any]) -> None:
def run_commands(
commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False
) -> None:
"""Run a sequence of commands in a subprocess, in order.
commands (List[str]): The split commands.
variables (Dict[str, str]): Dictionary of variable names, mapped to their
values. Will be used to substitute format string variables in the
commands.
silent (bool): Don't print the commands.
"""
for command in commands:
# Substitute variables, e.g. "./{NAME}.json"
command = command.format(**variables)
@ -418,21 +571,44 @@ def run_commands(
run_command(command)
def check_asset(url: str) -> None:
def convert_asset_url(url: str) -> str:
"""Check and convert the asset URL if needed.
url (str): The asset URL.
RETURNS (str): The converted URL.
"""
# If the asset URL is a regular GitHub URL it's likely a mistake
# TODO: support loading from GitHub URLs? Automatically convert to raw?
if re.match("(http(s?)):\/\/github.com", url):
converted = url.replace("github.com", "raw.githubusercontent.com")
converted = re.sub(r"/(tree|blob)/", "/", converted)
msg.warn(
"Downloading from a regular GitHub URL. This will only download "
"the source of the page, not the actual file. If you want to "
"download the raw file, click on 'Download' on the GitHub page "
"and copy the raw.githubusercontent.com URL instead."
"the source of the page, not the actual file. Converting the URL "
"to a raw URL.",
converted,
)
# url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/").replace("/tree/", "/")
return converted
return url
def check_clone_dest(dest: Path) -> None:
"""Check and validate that the destination path can be used to clone."""
def check_clone(name: str, dest: Path, repo: str) -> None:
"""Check and validate that the destination path can be used to clone. Will
check that Git is available and that the destination path is suitable.
name (str): Name of the directory to clone from the repo.
dest (Path): Local destination of cloned directory.
repo (str): URL of the repo to clone from.
"""
try:
subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
except Exception:
msg.fail(
f"Cloning spaCy project templates requires Git and the 'git' command. ",
f"To clone a project without Git, copy the files from the '{name}' "
f"directory in the {repo} to {dest} manually and then run:",
f"{COMMAND} project init {dest}",
exits=1,
)
if not dest:
msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
if dest.exists():
@ -444,12 +620,3 @@ def check_clone_dest(dest: Path) -> None:
f"Can't clone project, parent directory doesn't exist: {dest.parent}",
exits=1,
)
def get_hash(data) -> str:
    """Return the MD5 hex digest of the data dumped as JSON with sorted keys.

    data: The object to hash; it is serialized with srsly.json_dumps.
    RETURNS (str): The hex digest.
    """
    serialized = srsly.json_dumps(data, sort_keys=True)
    return hashlib.md5(serialized.encode("utf8")).hexdigest()
def get_checksum(path: Path) -> str:
    """Return the MD5 hex digest of a file's raw bytes.

    path (Path): The file to read.
    RETURNS (str): The hex digest.
    """
    digest = hashlib.md5()
    digest.update(path.read_bytes())
    return digest.hexdigest()

View File

@ -21,6 +21,7 @@ import subprocess
from contextlib import contextmanager
import tempfile
import shutil
import hashlib
try:
@ -459,11 +460,35 @@ def working_dir(path: Union[str, Path]) -> None:
@contextmanager
def make_tempdir():
    """Execute a block in a temporary directory and remove the directory and
    its contents at the end of the with block. The directory is also removed
    if the block raises an exception.

    YIELDS (Path): The path of the temp directory.
    """
    d = Path(tempfile.mkdtemp())
    try:
        yield d
    finally:
        # Without the finally, an error inside the with block would skip the
        # cleanup and leave the directory behind.
        shutil.rmtree(str(d))
def get_hash(data) -> str:
    """Compute a hash for a JSON-serializable object. The object is dumped to
    JSON with sorted keys, encoded as UTF-8 and hashed with MD5.

    data: The data to hash.
    RETURNS (str): The hex digest of the hash.
    """
    hasher = hashlib.md5()
    hasher.update(srsly.json_dumps(data, sort_keys=True).encode("utf8"))
    return hasher.hexdigest()
def get_checksum(path: Union[Path, str]) -> str:
    """Get the checksum for a file given its file path. The file is hashed in
    fixed-size chunks, so large files never have to be held in memory all at
    once. The result is the same MD5 hex digest as hashing the whole content.

    path (Union[Path, str]): The file path.
    RETURNS (str): The checksum.
    """
    md5 = hashlib.md5()
    with Path(path).open("rb") as file_:
        # iter() with a sentinel keeps reading until read() returns b"" (EOF).
        for chunk in iter(lambda: file_.read(65536), b""):
            md5.update(chunk)
    return md5.hexdigest()
def is_in_jupyter():
"""Check if user is running spaCy from a Jupyter notebook by detecting the
IPython kernel. Mainly used for the displaCy visualizer.