diff --git a/spacy/about.py b/spacy/about.py index 392bfd589..373d1d2b0 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,7 +1,6 @@ # fmt: off __title__ = "spacy-nightly" __version__ = "3.0.0a34" -__release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index c959c9861..373650172 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -1,4 +1,4 @@ -from typing import Dict, Any, Union, List, Optional, Tuple, TYPE_CHECKING +from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, TYPE_CHECKING import sys import shutil from pathlib import Path @@ -16,7 +16,8 @@ import os from ..schemas import ProjectConfigSchema, validate from ..util import import_file, run_command, make_tempdir, registry, logger -from ..util import ENV_VARS +from ..util import is_compatible_version, ENV_VARS +from .. import about if TYPE_CHECKING: from pathy import Pathy # noqa: F401 @@ -142,6 +143,7 @@ def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]: msg.fail(invalid_err) print("\n".join(errors)) sys.exit(1) + validate_project_version(config) validate_project_commands(config) # Make sure directories defined in config exist for subdir in config.get("directories", []): @@ -167,6 +169,23 @@ def substitute_project_variables(config: Dict[str, Any], overrides: Dict = {}): return dict(interpolated["project"]) +def validate_project_version(config: Dict[str, Any]) -> None: + """If the project defines a compatible spaCy version range, check that it's + compatible with the current version of spaCy. + + config (Dict[str, Any]): The loaded config. 
+ """ + spacy_version = config.get("spacy_version", None) + if spacy_version and not is_compatible_version(about.__version__, spacy_version): + err = ( + f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) " + f"that's not compatible with the version of spaCy you're running " + f"({about.__version__}). You can edit version requirement in the " + f"{PROJECT_FILE} to load it, but the project may not run as expected." + ) + msg.fail(err, exits=1) + + def validate_project_commands(config: Dict[str, Any]) -> None: """Check that project commands and workflows are valid, don't contain duplicates, don't clash and only refer to commands that exist. @@ -193,12 +212,15 @@ def validate_project_commands(config: Dict[str, Any]) -> None: ) -def get_hash(data) -> str: +def get_hash(data, exclude: Iterable[str] = tuple()) -> str: """Get the hash for a JSON-serializable object. data: The data to hash. + exclude (Iterable[str]): Top-level keys to exclude if data is a dict. RETURNS (str): The hash. """ + if isinstance(data, dict): + data = {k: v for k, v in data.items() if k not in exclude} data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8") return hashlib.md5(data_str).hexdigest() diff --git a/spacy/cli/project/remote_storage.py b/spacy/cli/project/remote_storage.py index e7e7cbbe8..6056458e2 100644 --- a/spacy/cli/project/remote_storage.py +++ b/spacy/cli/project/remote_storage.py @@ -7,7 +7,9 @@ import tarfile from pathlib import Path from .._util import get_hash, get_checksum, download_file, ensure_pathy -from ...util import make_tempdir +from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var +from ...git_info import GIT_VERSION +from ... import about if TYPE_CHECKING: from pathy import Pathy # noqa: F401 @@ -129,7 +131,10 @@ def get_command_hash( currently installed packages, whatever environment variables have been marked as relevant, and the command. 
""" - hashes = [site_hash, env_hash] + [get_checksum(dep) for dep in sorted(deps)] + check_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION) + spacy_v = GIT_VERSION if check_commit else get_minor_version(about.__version__) + dep_checksums = [get_checksum(dep) for dep in sorted(deps)] + hashes = [spacy_v, site_hash, env_hash] + dep_checksums hashes.extend(cmd) creation_bytes = "".join(hashes).encode("utf8") return hashlib.md5(creation_bytes).hexdigest() diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index 69c49fba7..1a9b447ea 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -4,8 +4,11 @@ from wasabi import msg import sys import srsly +from ... import about +from ...git_info import GIT_VERSION from ...util import working_dir, run_command, split_command, is_cwd, join_command -from ...util import SimpleFrozenList +from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS +from ...util import check_bool_env_var from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash from .._util import get_checksum, project_cli, Arg, Opt, COMMAND @@ -62,12 +65,13 @@ def project_run( err_help = "Maybe you forgot to run the 'project assets' command or a previous step?" 
err_kwargs = {"exits": 1} if not dry else {} msg.fail(err, err_help, **err_kwargs) + check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION) with working_dir(project_dir) as current_dir: - rerun = check_rerun(current_dir, cmd) + msg.divider(subcommand) + rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit) if not rerun and not force: msg.info(f"Skipping '{cmd['name']}': nothing changed") else: - msg.divider(subcommand) run_commands(cmd["script"], dry=dry) if not dry: update_lockfile(current_dir, cmd) @@ -171,12 +175,20 @@ def validate_subcommand( ) -def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool: +def check_rerun( + project_dir: Path, + command: Dict[str, Any], + *, + check_spacy_version: bool = True, + check_spacy_commit: bool = False, +) -> bool: """Check if a command should be rerun because its settings or inputs/outputs changed. project_dir (Path): The current project directory. command (Dict[str, Any]): The command, as defined in the project.yml. + check_spacy_version (bool): Whether to re-run if the spaCy minor version changed. + check_spacy_commit (bool): Whether to re-run if the spaCy commit changed. RETURNS (bool): Whether to re-run the command. 
""" lock_path = project_dir / PROJECT_LOCK @@ -189,10 +200,23 @@ def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool: # Always run commands with no outputs (otherwise they'd always be skipped) if not entry.get("outs", []): return True + # Always rerun if spaCy version or commit hash changed + spacy_v = entry.get("spacy_version") + commit = entry.get("spacy_git_version") + if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__): + info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)" + msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}") + return True + if check_spacy_commit and commit != GIT_VERSION: + info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)" + msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}") + return True # If the entry in the lockfile matches the lockfile entry that would be # generated from the current command, we don't rerun because it means that # all inputs/outputs, hashes and scripts are the same and nothing changed - return get_hash(get_lock_entry(project_dir, command)) != get_hash(entry) + lock_entry = get_lock_entry(project_dir, command) + exclude = ["spacy_version", "spacy_git_version"] + return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude) def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None: @@ -231,6 +255,8 @@ def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any] "script": command["script"], "deps": deps, "outs": [*outs, *outs_nc], + "spacy_version": about.__version__, + "spacy_git_version": GIT_VERSION, } diff --git a/spacy/schemas.py b/spacy/schemas.py index 591b7e134..0d88d4090 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -448,6 +448,7 @@ class ProjectConfigSchema(BaseModel): workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order") commands: 
List[ProjectConfigCommand] = Field([], title="Project command shortucts") title: Optional[str] = Field(None, title="Project title") + spacy_version: Optional[StrictStr] = Field(None, title="spaCy version range that the project is compatible with") # fmt: on class Config: diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index bdf54ad6a..b9a0a9d05 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -149,6 +149,21 @@ def test_is_unconstrained_version(constraint, expected): assert util.is_unconstrained_version(constraint) is expected +@pytest.mark.parametrize( + "a1,a2,b1,b2,is_match", + [ + ("3.0.0", "3.0", "3.0.1", "3.0", True), + ("3.1.0", "3.1", "3.2.1", "3.2", False), + ("xxx", None, "1.2.3.dev0", "1.2", False), + ], +) +def test_minor_version(a1, a2, b1, b2, is_match): + assert util.get_minor_version(a1) == a2 + assert util.get_minor_version(b1) == b2 + assert util.is_minor_version_match(a1, b1) is is_match + assert util.is_minor_version_match(a2, b2) is is_match + + @pytest.mark.parametrize( "dot_notation,expected", [ diff --git a/spacy/util.py b/spacy/util.py index 4d68e829c..aa321b22f 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -73,6 +73,7 @@ logger = logging.getLogger("spacy") class ENV_VARS: CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES" + PROJECT_USE_GIT_VERSION = "SPACY_PROJECT_USE_GIT_VERSION" class registry(thinc.registry): @@ -584,6 +585,33 @@ def get_base_version(version: str) -> str: return Version(version).base_version +def get_minor_version(version: str) -> Optional[str]: + """Get the major + minor version (without patch or prerelease identifiers). + + version (str): The version. + RETURNS (str): The major + minor version or None if version is invalid. 
+ """ + try: + v = Version(version) + except (TypeError, InvalidVersion): + return None + return f"{v.major}.{v.minor}" + + +def is_minor_version_match(version_a: str, version_b: str) -> bool: + """Compare two versions and check if they match in major and minor, without + patch or prerelease identifiers. Used internally for compatibility checks + that should be insensitive to patch releases. + + version_a (str): The first version + version_b (str): The second version. + RETURNS (bool): Whether the versions match. + """ + a = get_minor_version(version_a) + b = get_minor_version(version_b) + return a is not None and b is not None and a == b + + def load_meta(path: Union[str, Path]) -> Dict[str, Any]: """Load a model meta.json from a path and validate its contents. @@ -1315,3 +1343,16 @@ def is_cython_func(func: Callable) -> bool: cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]] return hasattr(cls_func, attr) return False + + +def check_bool_env_var(env_var: str) -> bool: + """Convert the value of an environment variable to a boolean. Add special + check for "0" (falsy) and consider everything else truthy, except unset. + + env_var (str): The name of the environment variable to check. + RETURNS (bool): Its boolean value. + """ + value = os.environ.get(env_var, False) + if value == "0": + return False + return bool(value) diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 6d5746308..5fced922d 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -216,15 +216,16 @@ pipelines. 
%%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml ``` -| Section | Description | -| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). | -| `description` | An optional project description used in [auto-generated docs](#custom-docs). | -| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. | -| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. | -| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. | -| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. 
Workflows can be run with the [`project run`](/api/cli#project-run) command. | -| `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. | +| Section | Description | +| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). | +| `description` | An optional project description used in [auto-generated docs](#custom-docs). | +| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. | +| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. 
| +| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. | +| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. | +| `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. | +| `spacy_version` | Optional spaCy version range like `>=3.0.0,<3.1.0` that the project is compatible with. If it's loaded with an incompatible version, an error is raised when the project is loaded. | ### Data assets {#data-assets}