mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
Merge pull request #6202 from explosion/feature/project-spacy-version
This commit is contained in:
commit
568617af58
|
@ -1,7 +1,6 @@
|
||||||
# fmt: off
|
# fmt: off
|
||||||
__title__ = "spacy-nightly"
|
__title__ = "spacy-nightly"
|
||||||
__version__ = "3.0.0a34"
|
__version__ = "3.0.0a34"
|
||||||
__release__ = True
|
|
||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||||
__projects__ = "https://github.com/explosion/projects"
|
__projects__ = "https://github.com/explosion/projects"
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Dict, Any, Union, List, Optional, Tuple, TYPE_CHECKING
|
from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, TYPE_CHECKING
|
||||||
import sys
|
import sys
|
||||||
import shutil
|
import shutil
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -16,7 +16,8 @@ import os
|
||||||
|
|
||||||
from ..schemas import ProjectConfigSchema, validate
|
from ..schemas import ProjectConfigSchema, validate
|
||||||
from ..util import import_file, run_command, make_tempdir, registry, logger
|
from ..util import import_file, run_command, make_tempdir, registry, logger
|
||||||
from ..util import ENV_VARS
|
from ..util import is_compatible_version, ENV_VARS
|
||||||
|
from .. import about
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from pathy import Pathy # noqa: F401
|
from pathy import Pathy # noqa: F401
|
||||||
|
@ -142,6 +143,7 @@ def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]:
|
||||||
msg.fail(invalid_err)
|
msg.fail(invalid_err)
|
||||||
print("\n".join(errors))
|
print("\n".join(errors))
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
validate_project_version(config)
|
||||||
validate_project_commands(config)
|
validate_project_commands(config)
|
||||||
# Make sure directories defined in config exist
|
# Make sure directories defined in config exist
|
||||||
for subdir in config.get("directories", []):
|
for subdir in config.get("directories", []):
|
||||||
|
@ -167,6 +169,23 @@ def substitute_project_variables(config: Dict[str, Any], overrides: Dict = {}):
|
||||||
return dict(interpolated["project"])
|
return dict(interpolated["project"])
|
||||||
|
|
||||||
|
|
||||||
|
def validate_project_version(config: Dict[str, Any]) -> None:
    """If the project defines a compatible spaCy version range, check that it's
    compatible with the current version of spaCy. If it isn't, an error is
    reported via msg.fail with exits=1.

    config (Dict[str, Any]): The loaded config.
    """
    spacy_version = config.get("spacy_version")
    # A missing or empty constraint means the project doesn't restrict the
    # spaCy version, so there's nothing to check.
    if not spacy_version:
        return
    if is_compatible_version(about.__version__, spacy_version):
        return
    err = (
        f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) "
        "that's not compatible with the version of spaCy you're running "
        f"({about.__version__}). You can edit version requirement in the "
        f"{PROJECT_FILE} to load it, but the project may not run as expected."
    )
    msg.fail(err, exits=1)
|
||||||
|
|
||||||
|
|
||||||
def validate_project_commands(config: Dict[str, Any]) -> None:
|
def validate_project_commands(config: Dict[str, Any]) -> None:
|
||||||
"""Check that project commands and workflows are valid, don't contain
|
"""Check that project commands and workflows are valid, don't contain
|
||||||
duplicates, don't clash and only refer to commands that exist.
|
duplicates, don't clash and only refer to commands that exist.
|
||||||
|
@ -193,12 +212,15 @@ def validate_project_commands(config: Dict[str, Any]) -> None:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
    """Get the hash for a JSON-serializable object.

    data: The data to hash.
    exclude (Iterable[str]): Top-level keys to exclude if data is a dict.
        Ignored for any other type of data.
    RETURNS (str): The hash (hex digest of the MD5 of the sorted-key JSON dump).
    """
    if isinstance(data, dict):
        # Materialize the keys once: "exclude" may be a one-shot iterator
        # (e.g. a generator), and testing membership against it for every key
        # would consume it and silently stop excluding keys.
        excluded = frozenset(exclude)
        data = {k: v for k, v in data.items() if k not in excluded}
    # sort_keys makes the dump, and therefore the hash, independent of the
    # insertion order of dict keys.
    data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
    return hashlib.md5(data_str).hexdigest()
|
||||||
|
|
||||||
|
|
|
@ -7,7 +7,9 @@ import tarfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from .._util import get_hash, get_checksum, download_file, ensure_pathy
|
from .._util import get_hash, get_checksum, download_file, ensure_pathy
|
||||||
from ...util import make_tempdir
|
from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
|
||||||
|
from ...git_info import GIT_VERSION
|
||||||
|
from ... import about
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from pathy import Pathy # noqa: F401
|
from pathy import Pathy # noqa: F401
|
||||||
|
@ -129,7 +131,10 @@ def get_command_hash(
|
||||||
currently installed packages, whatever environment variables have been marked
|
currently installed packages, whatever environment variables have been marked
|
||||||
as relevant, and the command.
|
as relevant, and the command.
|
||||||
"""
|
"""
|
||||||
hashes = [site_hash, env_hash] + [get_checksum(dep) for dep in sorted(deps)]
|
check_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
|
||||||
|
spacy_v = GIT_VERSION if check_commit else get_minor_version(about.__version__)
|
||||||
|
dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
|
||||||
|
hashes = [spacy_v, site_hash, env_hash] + dep_checksums
|
||||||
hashes.extend(cmd)
|
hashes.extend(cmd)
|
||||||
creation_bytes = "".join(hashes).encode("utf8")
|
creation_bytes = "".join(hashes).encode("utf8")
|
||||||
return hashlib.md5(creation_bytes).hexdigest()
|
return hashlib.md5(creation_bytes).hexdigest()
|
||||||
|
|
|
@ -4,8 +4,11 @@ from wasabi import msg
|
||||||
import sys
|
import sys
|
||||||
import srsly
|
import srsly
|
||||||
|
|
||||||
|
from ... import about
|
||||||
|
from ...git_info import GIT_VERSION
|
||||||
from ...util import working_dir, run_command, split_command, is_cwd, join_command
|
from ...util import working_dir, run_command, split_command, is_cwd, join_command
|
||||||
from ...util import SimpleFrozenList
|
from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
|
||||||
|
from ...util import check_bool_env_var
|
||||||
from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
|
from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
|
||||||
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND
|
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND
|
||||||
|
|
||||||
|
@ -62,12 +65,13 @@ def project_run(
|
||||||
err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
|
err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
|
||||||
err_kwargs = {"exits": 1} if not dry else {}
|
err_kwargs = {"exits": 1} if not dry else {}
|
||||||
msg.fail(err, err_help, **err_kwargs)
|
msg.fail(err, err_help, **err_kwargs)
|
||||||
|
check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
|
||||||
with working_dir(project_dir) as current_dir:
|
with working_dir(project_dir) as current_dir:
|
||||||
rerun = check_rerun(current_dir, cmd)
|
msg.divider(subcommand)
|
||||||
|
rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
|
||||||
if not rerun and not force:
|
if not rerun and not force:
|
||||||
msg.info(f"Skipping '{cmd['name']}': nothing changed")
|
msg.info(f"Skipping '{cmd['name']}': nothing changed")
|
||||||
else:
|
else:
|
||||||
msg.divider(subcommand)
|
|
||||||
run_commands(cmd["script"], dry=dry)
|
run_commands(cmd["script"], dry=dry)
|
||||||
if not dry:
|
if not dry:
|
||||||
update_lockfile(current_dir, cmd)
|
update_lockfile(current_dir, cmd)
|
||||||
|
@ -171,12 +175,19 @@ def validate_subcommand(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool:
|
def check_rerun(
|
||||||
|
project_dir: Path,
|
||||||
|
command: Dict[str, Any],
|
||||||
|
*,
|
||||||
|
check_spacy_version: bool = True,
|
||||||
|
check_spacy_commit: bool = False,
|
||||||
|
) -> bool:
|
||||||
"""Check if a command should be rerun because its settings or inputs/outputs
|
"""Check if a command should be rerun because its settings or inputs/outputs
|
||||||
changed.
|
changed.
|
||||||
|
|
||||||
project_dir (Path): The current project directory.
|
project_dir (Path): The current project directory.
|
||||||
command (Dict[str, Any]): The command, as defined in the project.yml.
|
command (Dict[str, Any]): The command, as defined in the project.yml.
|
||||||
|
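check_spacy_version (bool): Whether to re-run the command if the spaCy minor version differs from the one recorded in the lockfile.
check_spacy_commit (bool): Whether to re-run the command if the spaCy commit differs from the one recorded in the lockfile.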
|
||||||
RETURNS (bool): Whether to re-run the command.
|
RETURNS (bool): Whether to re-run the command.
|
||||||
"""
|
"""
|
||||||
lock_path = project_dir / PROJECT_LOCK
|
lock_path = project_dir / PROJECT_LOCK
|
||||||
|
@ -189,10 +200,23 @@ def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool:
|
||||||
# Always run commands with no outputs (otherwise they'd always be skipped)
|
# Always run commands with no outputs (otherwise they'd always be skipped)
|
||||||
if not entry.get("outs", []):
|
if not entry.get("outs", []):
|
||||||
return True
|
return True
|
||||||
|
# Always rerun if spaCy version or commit hash changed
|
||||||
|
spacy_v = entry.get("spacy_version")
|
||||||
|
commit = entry.get("spacy_git_version")
|
||||||
|
if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__):
|
||||||
|
info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)"
|
||||||
|
msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}")
|
||||||
|
return True
|
||||||
|
if check_spacy_commit and commit != GIT_VERSION:
|
||||||
|
info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
|
||||||
|
msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}")
|
||||||
|
return True
|
||||||
# If the entry in the lockfile matches the lockfile entry that would be
|
# If the entry in the lockfile matches the lockfile entry that would be
|
||||||
# generated from the current command, we don't rerun because it means that
|
# generated from the current command, we don't rerun because it means that
|
||||||
# all inputs/outputs, hashes and scripts are the same and nothing changed
|
# all inputs/outputs, hashes and scripts are the same and nothing changed
|
||||||
return get_hash(get_lock_entry(project_dir, command)) != get_hash(entry)
|
lock_entry = get_lock_entry(project_dir, command)
|
||||||
|
exclude = ["spacy_version", "spacy_git_version"]
|
||||||
|
return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude)
|
||||||
|
|
||||||
|
|
||||||
def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
|
def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
|
||||||
|
@ -231,6 +255,8 @@ def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]
|
||||||
"script": command["script"],
|
"script": command["script"],
|
||||||
"deps": deps,
|
"deps": deps,
|
||||||
"outs": [*outs, *outs_nc],
|
"outs": [*outs, *outs_nc],
|
||||||
|
"spacy_version": about.__version__,
|
||||||
|
"spacy_git_version": GIT_VERSION,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -448,6 +448,7 @@ class ProjectConfigSchema(BaseModel):
|
||||||
workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
|
workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
|
||||||
commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
|
commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
|
||||||
title: Optional[str] = Field(None, title="Project title")
|
title: Optional[str] = Field(None, title="Project title")
|
||||||
|
spacy_version: Optional[StrictStr] = Field(None, title="spaCy version range that the project is compatible with")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
|
|
|
@ -149,6 +149,21 @@ def test_is_unconstrained_version(constraint, expected):
|
||||||
assert util.is_unconstrained_version(constraint) is expected
|
assert util.is_unconstrained_version(constraint) is expected
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "a1,a2,b1,b2,is_match",
    [
        ("3.0.0", "3.0", "3.0.1", "3.0", True),
        ("3.1.0", "3.1", "3.2.1", "3.2", False),
        ("xxx", None, "1.2.3.dev0", "1.2", False),
    ],
)
def test_minor_version(a1, a2, b1, b2, is_match):
    """Check minor-version extraction and matching for a pair of versions.

    a1 / b1 are full version strings, a2 / b2 the expected major.minor values
    (None for an unparseable version) and is_match the expected comparison
    result for both the full and the reduced forms.
    """
    # Each full version should reduce to its expected major.minor form
    for full_version, expected_minor in ((a1, a2), (b1, b2)):
        assert util.get_minor_version(full_version) == expected_minor
    # Matching should give the same answer for full and reduced versions
    for left, right in ((a1, b1), (a2, b2)):
        assert util.is_minor_version_match(left, right) is is_match
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"dot_notation,expected",
|
"dot_notation,expected",
|
||||||
[
|
[
|
||||||
|
|
|
@ -73,6 +73,7 @@ logger = logging.getLogger("spacy")
|
||||||
|
|
||||||
class ENV_VARS:
    # Namespace for the names of environment variables, so call sites can
    # reference a constant instead of repeating the raw string.

    # Presumably the variable that carries config overrides; its reader isn't
    # visible here -- TODO confirm.
    CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES"
    # Read via check_bool_env_var() by the project commands: when truthy, the
    # spaCy git commit (GIT_VERSION) is used for command hashes and re-run
    # checks, in addition to / instead of the minor version.
    PROJECT_USE_GIT_VERSION = "SPACY_PROJECT_USE_GIT_VERSION"
|
||||||
|
|
||||||
|
|
||||||
class registry(thinc.registry):
|
class registry(thinc.registry):
|
||||||
|
@ -584,6 +585,33 @@ def get_base_version(version: str) -> str:
|
||||||
return Version(version).base_version
|
return Version(version).base_version
|
||||||
|
|
||||||
|
|
||||||
|
def get_minor_version(version: str) -> Optional[str]:
    """Reduce a version to its "major.minor" form, dropping the patch number
    and any prerelease or dev identifiers.

    version (str): The version.
    RETURNS (Optional[str]): The major + minor version, or None if the version
        can't be parsed.
    """
    try:
        parsed = Version(version)
    except (TypeError, InvalidVersion):
        # Not a parseable version (or not a string at all): signal this with
        # None instead of raising, so callers can treat it as "no match".
        return None
    else:
        return "{}.{}".format(parsed.major, parsed.minor)
|
||||||
|
|
||||||
|
|
||||||
|
def is_minor_version_match(version_a: str, version_b: str) -> bool:
    """Check whether two versions share the same major and minor version,
    ignoring patch and prerelease identifiers. Used internally for
    compatibility checks that shouldn't be sensitive to patch releases.

    version_a (str): The first version.
    version_b (str): The second version.
    RETURNS (bool): Whether the versions match. Always False if either
        version can't be reduced to a major.minor form.
    """
    minor_a, minor_b = [get_minor_version(v) for v in (version_a, version_b)]
    # An unparseable version never matches anything, not even another
    # unparseable version.
    if minor_a is None or minor_b is None:
        return False
    return minor_a == minor_b
|
||||||
|
|
||||||
|
|
||||||
def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
|
def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
|
||||||
"""Load a model meta.json from a path and validate its contents.
|
"""Load a model meta.json from a path and validate its contents.
|
||||||
|
|
||||||
|
@ -1315,3 +1343,16 @@ def is_cython_func(func: Callable) -> bool:
|
||||||
cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
|
cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
|
||||||
return hasattr(cls_func, attr)
|
return hasattr(cls_func, attr)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def check_bool_env_var(env_var: str) -> bool:
    """Convert the value of an environment variable to a boolean. The variable
    is considered falsy if it's unset, empty or "0" (ignoring surrounding
    whitespace), and truthy for every other value.

    env_var (str): The name of the environment variable to check.
    RETURNS (bool): Its boolean value.
    """
    # Strip whitespace so that padded values like "0 " (easily produced by
    # `set VAR=0 && ...` in the Windows command prompt) still count as falsy.
    value = os.environ.get(env_var, "").strip()
    return value not in ("", "0")
|
||||||
|
|
|
@ -216,15 +216,16 @@ pipelines.
|
||||||
%%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml
|
%%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml
|
||||||
```
|
```
|
||||||
|
|
||||||
| Section | Description |
|
| Section | Description |
|
||||||
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
|
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
|
||||||
| `description` | An optional project description used in [auto-generated docs](#custom-docs). |
|
| `description` | An optional project description used in [auto-generated docs](#custom-docs). |
|
||||||
| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
|
| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
|
||||||
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
|
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
|
||||||
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
|
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
|
||||||
| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
|
| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
|
||||||
| `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. |
|
| `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. |
|
||||||
|
| `spacy_version` | Optional spaCy version range like `>=3.0.0,<3.1.0` that the project is compatible with. If the installed spaCy version is outside this range, an error is raised when the project is loaded. |
|
||||||
|
|
||||||
### Data assets {#data-assets}
|
### Data assets {#data-assets}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user