Merge pull request #6202 from explosion/feature/project-spacy-version

This commit is contained in:
Ines Montani 2020-10-05 21:40:52 +02:00 committed by GitHub
commit 568617af58
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 130 additions and 20 deletions

View File

@ -1,7 +1,6 @@
# fmt: off # fmt: off
__title__ = "spacy-nightly" __title__ = "spacy-nightly"
__version__ = "3.0.0a34" __version__ = "3.0.0a34"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download" __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects" __projects__ = "https://github.com/explosion/projects"

View File

@ -1,4 +1,4 @@
from typing import Dict, Any, Union, List, Optional, Tuple, TYPE_CHECKING from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, TYPE_CHECKING
import sys import sys
import shutil import shutil
from pathlib import Path from pathlib import Path
@ -16,7 +16,8 @@ import os
from ..schemas import ProjectConfigSchema, validate from ..schemas import ProjectConfigSchema, validate
from ..util import import_file, run_command, make_tempdir, registry, logger from ..util import import_file, run_command, make_tempdir, registry, logger
from ..util import ENV_VARS from ..util import is_compatible_version, ENV_VARS
from .. import about
if TYPE_CHECKING: if TYPE_CHECKING:
from pathy import Pathy # noqa: F401 from pathy import Pathy # noqa: F401
@ -142,6 +143,7 @@ def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]:
msg.fail(invalid_err) msg.fail(invalid_err)
print("\n".join(errors)) print("\n".join(errors))
sys.exit(1) sys.exit(1)
validate_project_version(config)
validate_project_commands(config) validate_project_commands(config)
# Make sure directories defined in config exist # Make sure directories defined in config exist
for subdir in config.get("directories", []): for subdir in config.get("directories", []):
@ -167,6 +169,23 @@ def substitute_project_variables(config: Dict[str, Any], overrides: Dict = {}):
return dict(interpolated["project"]) return dict(interpolated["project"])
def validate_project_version(config: Dict[str, Any]) -> None:
"""If the project defines a compatible spaCy version range, chec that it's
compatible with the current version of spaCy.
config (Dict[str, Any]): The loaded config.
"""
spacy_version = config.get("spacy_version", None)
if spacy_version and not is_compatible_version(about.__version__, spacy_version):
err = (
f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) "
f"that's not compatible with the version of spaCy you're running "
f"({about.__version__}). You can edit version requirement in the "
f"{PROJECT_FILE} to load it, but the project may not run as expected."
)
msg.fail(err, exits=1)
def validate_project_commands(config: Dict[str, Any]) -> None: def validate_project_commands(config: Dict[str, Any]) -> None:
"""Check that project commands and workflows are valid, don't contain """Check that project commands and workflows are valid, don't contain
duplicates, don't clash and only refer to commands that exist. duplicates, don't clash and only refer to commands that exist.
@ -193,12 +212,15 @@ def validate_project_commands(config: Dict[str, Any]) -> None:
) )
def get_hash(data) -> str: def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
"""Get the hash for a JSON-serializable object. """Get the hash for a JSON-serializable object.
data: The data to hash. data: The data to hash.
exclude (Iterable[str]): Top-level keys to exclude if data is a dict.
RETURNS (str): The hash. RETURNS (str): The hash.
""" """
if isinstance(data, dict):
data = {k: v for k, v in data.items() if k not in exclude}
data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8") data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
return hashlib.md5(data_str).hexdigest() return hashlib.md5(data_str).hexdigest()

View File

@ -7,7 +7,9 @@ import tarfile
from pathlib import Path from pathlib import Path
from .._util import get_hash, get_checksum, download_file, ensure_pathy from .._util import get_hash, get_checksum, download_file, ensure_pathy
from ...util import make_tempdir from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
from ...git_info import GIT_VERSION
from ... import about
if TYPE_CHECKING: if TYPE_CHECKING:
from pathy import Pathy # noqa: F401 from pathy import Pathy # noqa: F401
@ -129,7 +131,10 @@ def get_command_hash(
currently installed packages, whatever environment variables have been marked currently installed packages, whatever environment variables have been marked
as relevant, and the command. as relevant, and the command.
""" """
hashes = [site_hash, env_hash] + [get_checksum(dep) for dep in sorted(deps)] check_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
spacy_v = GIT_VERSION if check_commit else get_minor_version(about.__version__)
dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
hashes = [spacy_v, site_hash, env_hash] + dep_checksums
hashes.extend(cmd) hashes.extend(cmd)
creation_bytes = "".join(hashes).encode("utf8") creation_bytes = "".join(hashes).encode("utf8")
return hashlib.md5(creation_bytes).hexdigest() return hashlib.md5(creation_bytes).hexdigest()

View File

@ -4,8 +4,11 @@ from wasabi import msg
import sys import sys
import srsly import srsly
from ... import about
from ...git_info import GIT_VERSION
from ...util import working_dir, run_command, split_command, is_cwd, join_command from ...util import working_dir, run_command, split_command, is_cwd, join_command
from ...util import SimpleFrozenList from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
from ...util import check_bool_env_var
from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND from .._util import get_checksum, project_cli, Arg, Opt, COMMAND
@ -62,12 +65,13 @@ def project_run(
err_help = "Maybe you forgot to run the 'project assets' command or a previous step?" err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
err_kwargs = {"exits": 1} if not dry else {} err_kwargs = {"exits": 1} if not dry else {}
msg.fail(err, err_help, **err_kwargs) msg.fail(err, err_help, **err_kwargs)
check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
with working_dir(project_dir) as current_dir: with working_dir(project_dir) as current_dir:
rerun = check_rerun(current_dir, cmd) msg.divider(subcommand)
rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
if not rerun and not force: if not rerun and not force:
msg.info(f"Skipping '{cmd['name']}': nothing changed") msg.info(f"Skipping '{cmd['name']}': nothing changed")
else: else:
msg.divider(subcommand)
run_commands(cmd["script"], dry=dry) run_commands(cmd["script"], dry=dry)
if not dry: if not dry:
update_lockfile(current_dir, cmd) update_lockfile(current_dir, cmd)
@ -171,12 +175,19 @@ def validate_subcommand(
) )
def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool: def check_rerun(
project_dir: Path,
command: Dict[str, Any],
*,
check_spacy_version: bool = True,
check_spacy_commit: bool = False,
) -> bool:
"""Check if a command should be rerun because its settings or inputs/outputs """Check if a command should be rerun because its settings or inputs/outputs
changed. changed.
project_dir (Path): The current project directory. project_dir (Path): The current project directory.
command (Dict[str, Any]): The command, as defined in the project.yml. command (Dict[str, Any]): The command, as defined in the project.yml.
strict_version (bool):
RETURNS (bool): Whether to re-run the command. RETURNS (bool): Whether to re-run the command.
""" """
lock_path = project_dir / PROJECT_LOCK lock_path = project_dir / PROJECT_LOCK
@ -189,10 +200,23 @@ def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool:
# Always run commands with no outputs (otherwise they'd always be skipped) # Always run commands with no outputs (otherwise they'd always be skipped)
if not entry.get("outs", []): if not entry.get("outs", []):
return True return True
# Always rerun if spaCy version or commit hash changed
spacy_v = entry.get("spacy_version")
commit = entry.get("spacy_git_version")
if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__):
info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)"
msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}")
return True
if check_spacy_commit and commit != GIT_VERSION:
info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}")
return True
# If the entry in the lockfile matches the lockfile entry that would be # If the entry in the lockfile matches the lockfile entry that would be
# generated from the current command, we don't rerun because it means that # generated from the current command, we don't rerun because it means that
# all inputs/outputs, hashes and scripts are the same and nothing changed # all inputs/outputs, hashes and scripts are the same and nothing changed
return get_hash(get_lock_entry(project_dir, command)) != get_hash(entry) lock_entry = get_lock_entry(project_dir, command)
exclude = ["spacy_version", "spacy_git_version"]
return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude)
def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None: def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
@ -231,6 +255,8 @@ def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]
"script": command["script"], "script": command["script"],
"deps": deps, "deps": deps,
"outs": [*outs, *outs_nc], "outs": [*outs, *outs_nc],
"spacy_version": about.__version__,
"spacy_git_version": GIT_VERSION,
} }

View File

@ -448,6 +448,7 @@ class ProjectConfigSchema(BaseModel):
workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order") workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts") commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
title: Optional[str] = Field(None, title="Project title") title: Optional[str] = Field(None, title="Project title")
spacy_version: Optional[StrictStr] = Field(None, title="spaCy version range that the project is compatible with")
# fmt: on # fmt: on
class Config: class Config:

View File

@ -149,6 +149,21 @@ def test_is_unconstrained_version(constraint, expected):
assert util.is_unconstrained_version(constraint) is expected assert util.is_unconstrained_version(constraint) is expected
@pytest.mark.parametrize(
"a1,a2,b1,b2,is_match",
[
("3.0.0", "3.0", "3.0.1", "3.0", True),
("3.1.0", "3.1", "3.2.1", "3.2", False),
("xxx", None, "1.2.3.dev0", "1.2", False),
],
)
def test_minor_version(a1, a2, b1, b2, is_match):
assert util.get_minor_version(a1) == a2
assert util.get_minor_version(b1) == b2
assert util.is_minor_version_match(a1, b1) is is_match
assert util.is_minor_version_match(a2, b2) is is_match
@pytest.mark.parametrize( @pytest.mark.parametrize(
"dot_notation,expected", "dot_notation,expected",
[ [

View File

@ -73,6 +73,7 @@ logger = logging.getLogger("spacy")
class ENV_VARS: class ENV_VARS:
CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES" CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES"
PROJECT_USE_GIT_VERSION = "SPACY_PROJECT_USE_GIT_VERSION"
class registry(thinc.registry): class registry(thinc.registry):
@ -584,6 +585,33 @@ def get_base_version(version: str) -> str:
return Version(version).base_version return Version(version).base_version
def get_minor_version(version: str) -> Optional[str]:
"""Get the major + minor version (without patch or prerelease identifiers).
version (str): The version.
RETURNS (str): The major + minor version or None if version is invalid.
"""
try:
v = Version(version)
except (TypeError, InvalidVersion):
return None
return f"{v.major}.{v.minor}"
def is_minor_version_match(version_a: str, version_b: str) -> bool:
"""Compare two versions and check if they match in major and minor, without
patch or prerelease identifiers. Used internally for compatibility checks
that should be insensitive to patch releases.
version_a (str): The first version
version_b (str): The second version.
RETURNS (bool): Whether the versions match.
"""
a = get_minor_version(version_a)
b = get_minor_version(version_b)
return a is not None and b is not None and a == b
def load_meta(path: Union[str, Path]) -> Dict[str, Any]: def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
"""Load a model meta.json from a path and validate its contents. """Load a model meta.json from a path and validate its contents.
@ -1315,3 +1343,16 @@ def is_cython_func(func: Callable) -> bool:
cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]] cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
return hasattr(cls_func, attr) return hasattr(cls_func, attr)
return False return False
def check_bool_env_var(env_var: str) -> bool:
"""Convert the value of an environment variable to a boolean. Add special
check for "0" (falsy) and consider everything else truthy, except unset.
env_var (str): The name of the environment variable to check.
RETURNS (bool): Its boolean value.
"""
value = os.environ.get(env_var, False)
if value == "0":
return False
return bool(value)

View File

@ -216,15 +216,16 @@ pipelines.
%%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml %%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml
``` ```
| Section | Description | | Section | Description |
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). | | `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
| `description` | An optional project description used in [auto-generated docs](#custom-docs). | | `description` | An optional project description used in [auto-generated docs](#custom-docs). |
| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. | | `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. | | `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. | | `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. | | `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
| `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. | | `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. |
| `spacy_version` | Optional spaCy version range like `>=3.0.0,<3.1.0` that the project is compatible with. If it's loaded with an incompatible version, an error is raised when the project is loaded. |
### Data assets {#data-assets} ### Data assets {#data-assets}