From 6958510bdaaa279c8b4f5184bbdbbe6cf3c7cf8a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 5 Oct 2020 13:53:07 +0200 Subject: [PATCH] Include spaCy version check in project CLI --- spacy/cli/_util.py | 7 +++++-- spacy/cli/project/remote_storage.py | 7 +++++-- spacy/cli/project/run.py | 31 +++++++++++++++++++++++++---- spacy/tests/test_misc.py | 15 ++++++++++++++ spacy/util.py | 27 +++++++++++++++++++++++++ 5 files changed, 79 insertions(+), 8 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index c959c9861..676a7c8d7 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -1,4 +1,4 @@ -from typing import Dict, Any, Union, List, Optional, Tuple, TYPE_CHECKING +from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, TYPE_CHECKING import sys import shutil from pathlib import Path @@ -193,12 +193,15 @@ def validate_project_commands(config: Dict[str, Any]) -> None: ) -def get_hash(data) -> str: +def get_hash(data, exclude: Iterable[str] = tuple()) -> str: """Get the hash for a JSON-serializable object. data: The data to hash. + exclude (Iterable[str]): Top-level keys to exclude if data is a dict. RETURNS (str): The hash. """ + if isinstance(data, dict): + data = {k: v for k, v in data.items() if k not in exclude} data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8") return hashlib.md5(data_str).hexdigest() diff --git a/spacy/cli/project/remote_storage.py b/spacy/cli/project/remote_storage.py index e7e7cbbe8..7e2caa8d7 100644 --- a/spacy/cli/project/remote_storage.py +++ b/spacy/cli/project/remote_storage.py @@ -7,7 +7,8 @@ import tarfile from pathlib import Path from .._util import get_hash, get_checksum, download_file, ensure_pathy -from ...util import make_tempdir +from ...util import make_tempdir, get_minor_version +from ... import about if TYPE_CHECKING: from pathy import Pathy # noqa: F401 @@ -129,7 +130,9 @@ def get_command_hash( currently installed packages, whatever environment variables have been marked as relevant, and the command. """ - hashes = [site_hash, env_hash] + [get_checksum(dep) for dep in sorted(deps)] + spacy_v = get_minor_version(about.__version__) + dep_checksums = [get_checksum(dep) for dep in sorted(deps)] + hashes = [spacy_v, site_hash, env_hash] + dep_checksums hashes.extend(cmd) creation_bytes = "".join(hashes).encode("utf8") return hashlib.md5(creation_bytes).hexdigest() diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index 69c49fba7..94d4371d0 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -4,8 +4,10 @@ from wasabi import msg import sys import srsly +from ... import about +from ...git_info import GIT_VERSION from ...util import working_dir, run_command, split_command, is_cwd, join_command -from ...util import SimpleFrozenList +from ...util import SimpleFrozenList, is_minor_version_match from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash from .._util import get_checksum, project_cli, Arg, Opt, COMMAND @@ -63,11 +65,11 @@ def project_run( err_kwargs = {"exits": 1} if not dry else {} msg.fail(err, err_help, **err_kwargs) with working_dir(project_dir) as current_dir: + msg.divider(subcommand) rerun = check_rerun(current_dir, cmd) if not rerun and not force: msg.info(f"Skipping '{cmd['name']}': nothing changed") else: - msg.divider(subcommand) run_commands(cmd["script"], dry=dry) if not dry: update_lockfile(current_dir, cmd) @@ -171,12 +173,18 @@ def validate_subcommand( ) -def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool: +def check_rerun( + project_dir: Path, + command: Dict[str, Any], + check_spacy_version: bool = True, + check_spacy_commit: bool = False, +) -> bool: """Check if a command should be rerun because its settings or inputs/outputs changed. project_dir (Path): The current project directory. command (Dict[str, Any]): The command, as defined in the project.yml. + strict_version (bool): RETURNS (bool): Whether to re-run the command. """ lock_path = project_dir / PROJECT_LOCK @@ -189,10 +197,23 @@ def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool: # Always run commands with no outputs (otherwise they'd always be skipped) if not entry.get("outs", []): return True + # Always rerun if spaCy version or commit hash changed + spacy_v = entry.get("spacy_version") + commit = entry.get("spacy_git_version") + if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__): + info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)" + msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}") + return True + if check_spacy_commit and commit != GIT_VERSION: + info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)" + msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}") + return True # If the entry in the lockfile matches the lockfile entry that would be # generated from the current command, we don't rerun because it means that # all inputs/outputs, hashes and scripts are the same and nothing changed - return get_hash(get_lock_entry(project_dir, command)) != get_hash(entry) + lock_entry = get_lock_entry(project_dir, command) + exclude = ["spacy_version", "spacy_git_version"] + return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude) def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None: @@ -231,6 +252,8 @@ def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any] "script": command["script"], "deps": deps, "outs": [*outs, *outs_nc], + "spacy_version": about.__version__, + "spacy_git_version": GIT_VERSION, } diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index bdf54ad6a..b9a0a9d05 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -149,6 +149,21 @@ def test_is_unconstrained_version(constraint, expected): assert util.is_unconstrained_version(constraint) is expected +@pytest.mark.parametrize( + "a1,a2,b1,b2,is_match", + [ + ("3.0.0", "3.0", "3.0.1", "3.0", True), + ("3.1.0", "3.1", "3.2.1", "3.2", False), + ("xxx", None, "1.2.3.dev0", "1.2", False), + ], +) +def test_minor_version(a1, a2, b1, b2, is_match): + assert util.get_minor_version(a1) == a2 + assert util.get_minor_version(b1) == b2 + assert util.is_minor_version_match(a1, b1) is is_match + assert util.is_minor_version_match(a2, b2) is is_match + + @pytest.mark.parametrize( "dot_notation,expected", [ diff --git a/spacy/util.py b/spacy/util.py index 4d68e829c..4b2cb018a 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -584,6 +584,33 @@ def get_base_version(version: str) -> str: return Version(version).base_version +def get_minor_version(version: str) -> Optional[str]: + """Get the major + minor version (without patch or prerelease identifiers). + + version (str): The version. + RETURNS (str): The major + minor version or None if version is invalid. + """ + try: + v = Version(version) + except (TypeError, InvalidVersion): + return None + return f"{v.major}.{v.minor}" + + +def is_minor_version_match(version_a: str, version_b: str) -> bool: + """Compare two versions and check if they match in major and minor, without + patch or prerelease identifiers. Used internally for compatibility checks + that should be insensitive to patch releases. + + version_a (str): The first version + version_b (str): The second version. + RETURNS (bool): Whether the versions match. + """ + a = get_minor_version(version_a) + b = get_minor_version(version_b) + return a is not None and b is not None and a == b + + def load_meta(path: Union[str, Path]) -> Dict[str, Any]: """Load a model meta.json from a path and validate its contents.