Include spaCy version check in project CLI

This commit is contained in:
Ines Montani 2020-10-05 13:53:07 +02:00
parent 20f2a17a09
commit 6958510bda
5 changed files with 79 additions and 8 deletions

View File

@ -1,4 +1,4 @@
from typing import Dict, Any, Union, List, Optional, Tuple, TYPE_CHECKING from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, TYPE_CHECKING
import sys import sys
import shutil import shutil
from pathlib import Path from pathlib import Path
@ -193,12 +193,15 @@ def validate_project_commands(config: Dict[str, Any]) -> None:
) )
def get_hash(data) -> str: def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
"""Get the hash for a JSON-serializable object. """Get the hash for a JSON-serializable object.
data: The data to hash. data: The data to hash.
exclude (Iterable[str]): Top-level keys to exclude if data is a dict.
RETURNS (str): The hash. RETURNS (str): The hash.
""" """
if isinstance(data, dict):
data = {k: v for k, v in data.items() if k not in exclude}
data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8") data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
return hashlib.md5(data_str).hexdigest() return hashlib.md5(data_str).hexdigest()

View File

@ -7,7 +7,8 @@ import tarfile
from pathlib import Path from pathlib import Path
from .._util import get_hash, get_checksum, download_file, ensure_pathy from .._util import get_hash, get_checksum, download_file, ensure_pathy
from ...util import make_tempdir from ...util import make_tempdir, get_minor_version
from ... import about
if TYPE_CHECKING: if TYPE_CHECKING:
from pathy import Pathy # noqa: F401 from pathy import Pathy # noqa: F401
@ -129,7 +130,9 @@ def get_command_hash(
currently installed packages, whatever environment variables have been marked currently installed packages, whatever environment variables have been marked
as relevant, and the command. as relevant, and the command.
""" """
hashes = [site_hash, env_hash] + [get_checksum(dep) for dep in sorted(deps)] spacy_v = get_minor_version(about.__version__)
dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
hashes = [spacy_v, site_hash, env_hash] + dep_checksums
hashes.extend(cmd) hashes.extend(cmd)
creation_bytes = "".join(hashes).encode("utf8") creation_bytes = "".join(hashes).encode("utf8")
return hashlib.md5(creation_bytes).hexdigest() return hashlib.md5(creation_bytes).hexdigest()

View File

@ -4,8 +4,10 @@ from wasabi import msg
import sys import sys
import srsly import srsly
from ... import about
from ...git_info import GIT_VERSION
from ...util import working_dir, run_command, split_command, is_cwd, join_command from ...util import working_dir, run_command, split_command, is_cwd, join_command
from ...util import SimpleFrozenList from ...util import SimpleFrozenList, is_minor_version_match
from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND from .._util import get_checksum, project_cli, Arg, Opt, COMMAND
@ -63,11 +65,11 @@ def project_run(
err_kwargs = {"exits": 1} if not dry else {} err_kwargs = {"exits": 1} if not dry else {}
msg.fail(err, err_help, **err_kwargs) msg.fail(err, err_help, **err_kwargs)
with working_dir(project_dir) as current_dir: with working_dir(project_dir) as current_dir:
msg.divider(subcommand)
rerun = check_rerun(current_dir, cmd) rerun = check_rerun(current_dir, cmd)
if not rerun and not force: if not rerun and not force:
msg.info(f"Skipping '{cmd['name']}': nothing changed") msg.info(f"Skipping '{cmd['name']}': nothing changed")
else: else:
msg.divider(subcommand)
run_commands(cmd["script"], dry=dry) run_commands(cmd["script"], dry=dry)
if not dry: if not dry:
update_lockfile(current_dir, cmd) update_lockfile(current_dir, cmd)
@ -171,12 +173,18 @@ def validate_subcommand(
) )
def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool: def check_rerun(
project_dir: Path,
command: Dict[str, Any],
check_spacy_version: bool = True,
check_spacy_commit: bool = False,
) -> bool:
"""Check if a command should be rerun because its settings or inputs/outputs """Check if a command should be rerun because its settings or inputs/outputs
changed. changed.
project_dir (Path): The current project directory. project_dir (Path): The current project directory.
command (Dict[str, Any]): The command, as defined in the project.yml. command (Dict[str, Any]): The command, as defined in the project.yml.
strict_version (bool):
RETURNS (bool): Whether to re-run the command. RETURNS (bool): Whether to re-run the command.
""" """
lock_path = project_dir / PROJECT_LOCK lock_path = project_dir / PROJECT_LOCK
@ -189,10 +197,23 @@ def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool:
# Always run commands with no outputs (otherwise they'd always be skipped) # Always run commands with no outputs (otherwise they'd always be skipped)
if not entry.get("outs", []): if not entry.get("outs", []):
return True return True
# Always rerun if spaCy version or commit hash changed
spacy_v = entry.get("spacy_version")
commit = entry.get("spacy_git_version")
if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__):
info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)"
msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}")
return True
if check_spacy_commit and commit != GIT_VERSION:
info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}")
return True
# If the entry in the lockfile matches the lockfile entry that would be # If the entry in the lockfile matches the lockfile entry that would be
# generated from the current command, we don't rerun because it means that # generated from the current command, we don't rerun because it means that
# all inputs/outputs, hashes and scripts are the same and nothing changed # all inputs/outputs, hashes and scripts are the same and nothing changed
return get_hash(get_lock_entry(project_dir, command)) != get_hash(entry) lock_entry = get_lock_entry(project_dir, command)
exclude = ["spacy_version", "spacy_git_version"]
return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude)
def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None: def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
@ -231,6 +252,8 @@ def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]
"script": command["script"], "script": command["script"],
"deps": deps, "deps": deps,
"outs": [*outs, *outs_nc], "outs": [*outs, *outs_nc],
"spacy_version": about.__version__,
"spacy_git_version": GIT_VERSION,
} }

View File

@ -149,6 +149,21 @@ def test_is_unconstrained_version(constraint, expected):
assert util.is_unconstrained_version(constraint) is expected assert util.is_unconstrained_version(constraint) is expected
@pytest.mark.parametrize(
"a1,a2,b1,b2,is_match",
[
("3.0.0", "3.0", "3.0.1", "3.0", True),
("3.1.0", "3.1", "3.2.1", "3.2", False),
("xxx", None, "1.2.3.dev0", "1.2", False),
],
)
def test_minor_version(a1, a2, b1, b2, is_match):
assert util.get_minor_version(a1) == a2
assert util.get_minor_version(b1) == b2
assert util.is_minor_version_match(a1, b1) is is_match
assert util.is_minor_version_match(a2, b2) is is_match
@pytest.mark.parametrize( @pytest.mark.parametrize(
"dot_notation,expected", "dot_notation,expected",
[ [

View File

@ -584,6 +584,33 @@ def get_base_version(version: str) -> str:
return Version(version).base_version return Version(version).base_version
def get_minor_version(version: str) -> Optional[str]:
"""Get the major + minor version (without patch or prerelease identifiers).
version (str): The version.
RETURNS (str): The major + minor version or None if version is invalid.
"""
try:
v = Version(version)
except (TypeError, InvalidVersion):
return None
return f"{v.major}.{v.minor}"
def is_minor_version_match(version_a: str, version_b: str) -> bool:
"""Compare two versions and check if they match in major and minor, without
patch or prerelease identifiers. Used internally for compatibility checks
that should be insensitive to patch releases.
version_a (str): The first version
version_b (str): The second version.
RETURNS (bool): Whether the versions match.
"""
a = get_minor_version(version_a)
b = get_minor_version(version_b)
return a is not None and b is not None and a == b
def load_meta(path: Union[str, Path]) -> Dict[str, Any]: def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
"""Load a model meta.json from a path and validate its contents. """Load a model meta.json from a path and validate its contents.