mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
Merge remote-tracking branch 'upstream/develop' into fix/small-edits
# Conflicts: # spacy/cli/project.py
This commit is contained in:
commit
b311ce982f
|
@ -16,6 +16,7 @@ from .convert import convert # noqa: F401
|
||||||
from .init_model import init_model # noqa: F401
|
from .init_model import init_model # noqa: F401
|
||||||
from .validate import validate # noqa: F401
|
from .validate import validate # noqa: F401
|
||||||
from .project import project_clone, project_assets, project_run # noqa: F401
|
from .project import project_clone, project_assets, project_run # noqa: F401
|
||||||
|
from .project import project_run_all # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import List, Dict, Any, Optional
|
from typing import List, Dict, Any, Optional, Sequence
|
||||||
import typer
|
import typer
|
||||||
import srsly
|
import srsly
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -22,6 +22,7 @@ from ..util import get_hash, get_checksum
|
||||||
|
|
||||||
CONFIG_FILE = "project.yml"
|
CONFIG_FILE = "project.yml"
|
||||||
DVC_CONFIG = "dvc.yaml"
|
DVC_CONFIG = "dvc.yaml"
|
||||||
|
DVC_DIR = ".dvc"
|
||||||
DIRS = [
|
DIRS = [
|
||||||
"assets",
|
"assets",
|
||||||
"metas",
|
"metas",
|
||||||
|
@ -49,7 +50,7 @@ Version Control) to manage input and output files and to ensure steps are only
|
||||||
re-run if their inputs change.
|
re-run if their inputs change.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
project_cli = typer.Typer(help=CLI_HELP)
|
project_cli = typer.Typer(help=CLI_HELP, no_args_is_help=True)
|
||||||
|
|
||||||
|
|
||||||
@project_cli.callback(invoke_without_command=True)
|
@project_cli.callback(invoke_without_command=True)
|
||||||
|
@ -91,6 +92,7 @@ def project_clone_cli(
|
||||||
def project_init_cli(
|
def project_init_cli(
|
||||||
path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False),
|
path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False),
|
||||||
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
|
git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
|
||||||
|
force: bool = Opt(False, "--force", "-F", help="Force initiziation"),
|
||||||
):
|
):
|
||||||
"""Initialize a project directory with DVC and optionally Git. This should
|
"""Initialize a project directory with DVC and optionally Git. This should
|
||||||
typically be taken care of automatically when you run the "project clone"
|
typically be taken care of automatically when you run the "project clone"
|
||||||
|
@ -98,7 +100,7 @@ def project_init_cli(
|
||||||
be a Git repo, it should be initialized with Git first, before initializing
|
be a Git repo, it should be initialized with Git first, before initializing
|
||||||
DVC. This allows DVC to integrate with Git.
|
DVC. This allows DVC to integrate with Git.
|
||||||
"""
|
"""
|
||||||
project_init(path, git=git, silent=True)
|
project_init(path, git=git, force=force, silent=True)
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("assets")
|
@project_cli.command("assets")
|
||||||
|
@ -252,7 +254,7 @@ def project_clone(
|
||||||
if not dir_path.exists():
|
if not dir_path.exists():
|
||||||
dir_path.mkdir(parents=True)
|
dir_path.mkdir(parents=True)
|
||||||
if not no_init:
|
if not no_init:
|
||||||
project_init(project_dir, git=git, silent=True)
|
project_init(project_dir, git=git, force=True, silent=True)
|
||||||
msg.good(f"Your project is now ready!", dest)
|
msg.good(f"Your project is now ready!", dest)
|
||||||
print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
|
print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
|
||||||
|
|
||||||
|
@ -261,6 +263,7 @@ def project_init(
|
||||||
project_dir: Path,
|
project_dir: Path,
|
||||||
*,
|
*,
|
||||||
git: bool = False,
|
git: bool = False,
|
||||||
|
force: bool = False,
|
||||||
silent: bool = False,
|
silent: bool = False,
|
||||||
analytics: bool = False,
|
analytics: bool = False,
|
||||||
):
|
):
|
||||||
|
@ -271,19 +274,29 @@ def project_init(
|
||||||
silent (bool): Don't print any output (via DVC).
|
silent (bool): Don't print any output (via DVC).
|
||||||
analytics (bool): Opt-in to DVC analytics (defaults to False).
|
analytics (bool): Opt-in to DVC analytics (defaults to False).
|
||||||
"""
|
"""
|
||||||
|
project_dir = project_dir.resolve()
|
||||||
with working_dir(project_dir):
|
with working_dir(project_dir):
|
||||||
|
if git:
|
||||||
|
run_command(["git", "init"])
|
||||||
init_cmd = ["dvc", "init"]
|
init_cmd = ["dvc", "init"]
|
||||||
if silent:
|
if silent:
|
||||||
init_cmd.append("--quiet")
|
init_cmd.append("--quiet")
|
||||||
if not git:
|
if not git:
|
||||||
init_cmd.append("--no-scm")
|
init_cmd.append("--no-scm")
|
||||||
if git:
|
if force:
|
||||||
run_command(["git", "init"])
|
init_cmd.append("--force")
|
||||||
run_command(init_cmd)
|
run_command(init_cmd)
|
||||||
# We don't want to have analytics on by default – our users should
|
# We don't want to have analytics on by default – our users should
|
||||||
# opt-in explicitly. If they want it, they can always enable it.
|
# opt-in explicitly. If they want it, they can always enable it.
|
||||||
if not analytics:
|
if not analytics:
|
||||||
run_command(["dvc", "config", "core.analytics", "false"])
|
run_command(["dvc", "config", "core.analytics", "false"])
|
||||||
|
# Remove unused and confusing plot templates from .dvc directory
|
||||||
|
# TODO: maybe we shouldn't do this, but it's otherwise super confusing
|
||||||
|
# once you commit your changes via Git and it creates a bunch of files
|
||||||
|
# that have no purpose
|
||||||
|
plots_dir = project_dir / DVC_DIR / "plots"
|
||||||
|
if plots_dir.exists():
|
||||||
|
shutil.rmtree(str(plots_dir))
|
||||||
config = load_project_config(project_dir)
|
config = load_project_config(project_dir)
|
||||||
setup_check_dvc(project_dir, config)
|
setup_check_dvc(project_dir, config)
|
||||||
|
|
||||||
|
@ -301,15 +314,21 @@ def project_assets(project_dir: Path) -> None:
|
||||||
msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0)
|
msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0)
|
||||||
msg.info(f"Fetching {len(assets)} asset(s)")
|
msg.info(f"Fetching {len(assets)} asset(s)")
|
||||||
variables = config.get("variables", {})
|
variables = config.get("variables", {})
|
||||||
|
fetched_assets = []
|
||||||
for asset in assets:
|
for asset in assets:
|
||||||
url = asset["url"].format(**variables)
|
url = asset["url"].format(**variables)
|
||||||
dest = asset["dest"].format(**variables)
|
dest = asset["dest"].format(**variables)
|
||||||
fetch_asset(project_path, url, dest, asset.get("checksum"))
|
fetched_path = fetch_asset(project_path, url, dest, asset.get("checksum"))
|
||||||
|
if fetched_path:
|
||||||
|
fetched_assets.append(str(fetched_path))
|
||||||
|
if fetched_assets:
|
||||||
|
with working_dir(project_path):
|
||||||
|
run_command(["dvc", "add", *fetched_assets, "--external"])
|
||||||
|
|
||||||
|
|
||||||
def fetch_asset(
|
def fetch_asset(
|
||||||
project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
|
project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
|
||||||
) -> None:
|
) -> Optional[Path]:
|
||||||
"""Fetch an asset from a given URL or path. Will try to import the file
|
"""Fetch an asset from a given URL or path. Will try to import the file
|
||||||
using DVC's import-url if possible (fully tracked and versioned) and falls
|
using DVC's import-url if possible (fully tracked and versioned) and falls
|
||||||
back to get-url (versioned) and a non-DVC download if necessary. If a
|
back to get-url (versioned) and a non-DVC download if necessary. If a
|
||||||
|
@ -319,6 +338,8 @@ def fetch_asset(
|
||||||
project_path (Path): Path to project directory.
|
project_path (Path): Path to project directory.
|
||||||
url (str): URL or path to asset.
|
url (str): URL or path to asset.
|
||||||
checksum (Optional[str]): Optional expected checksum of local file.
|
checksum (Optional[str]): Optional expected checksum of local file.
|
||||||
|
RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
|
||||||
|
the asset failed.
|
||||||
"""
|
"""
|
||||||
url = convert_asset_url(url)
|
url = convert_asset_url(url)
|
||||||
dest_path = (project_path / dest).resolve()
|
dest_path = (project_path / dest).resolve()
|
||||||
|
@ -327,8 +348,7 @@ def fetch_asset(
|
||||||
# TODO: add support for caches (dvc import-url with local path)
|
# TODO: add support for caches (dvc import-url with local path)
|
||||||
if checksum == get_checksum(dest_path):
|
if checksum == get_checksum(dest_path):
|
||||||
msg.good(f"Skipping download with matching checksum: {dest}")
|
msg.good(f"Skipping download with matching checksum: {dest}")
|
||||||
return
|
return dest_path
|
||||||
dvc_add_cmd = ["dvc", "add", str(dest_path), "--external"]
|
|
||||||
with working_dir(project_path):
|
with working_dir(project_path):
|
||||||
try:
|
try:
|
||||||
# If these fail, we don't want to output an error or info message.
|
# If these fail, we don't want to output an error or info message.
|
||||||
|
@ -340,16 +360,16 @@ def fetch_asset(
|
||||||
except subprocess.CalledProcessError:
|
except subprocess.CalledProcessError:
|
||||||
dvc_cmd = ["dvc", "get-url", url, str(dest_path)]
|
dvc_cmd = ["dvc", "get-url", url, str(dest_path)]
|
||||||
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
|
print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
|
||||||
run_command(dvc_add_cmd)
|
|
||||||
except subprocess.CalledProcessError:
|
except subprocess.CalledProcessError:
|
||||||
try:
|
try:
|
||||||
download_file(url, dest_path)
|
download_file(url, dest_path)
|
||||||
except requests.exceptions.HTTPError as e:
|
except requests.exceptions.HTTPError as e:
|
||||||
msg.fail(f"Download failed: {dest}", e)
|
msg.fail(f"Download failed: {dest}", e)
|
||||||
run_command(dvc_add_cmd)
|
return None
|
||||||
if checksum and checksum != get_checksum(dest_path):
|
if checksum and checksum != get_checksum(dest_path):
|
||||||
msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}")
|
msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}")
|
||||||
msg.good(f"Fetched asset {dest}")
|
msg.good(f"Fetched asset {dest}")
|
||||||
|
return dest_path
|
||||||
|
|
||||||
|
|
||||||
def project_run_all(project_dir: Path, *dvc_args) -> None:
|
def project_run_all(project_dir: Path, *dvc_args) -> None:
|
||||||
|
@ -378,8 +398,7 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
|
||||||
config_commands = config.get("commands", [])
|
config_commands = config.get("commands", [])
|
||||||
commands = {cmd["name"]: cmd for cmd in config_commands}
|
commands = {cmd["name"]: cmd for cmd in config_commands}
|
||||||
if subcommand:
|
if subcommand:
|
||||||
if subcommand not in commands:
|
validate_subcommand(commands.keys(), subcommand)
|
||||||
msg.fail(f"Can't find command '{subcommand}' in project config", exits=1)
|
|
||||||
print(f"Usage: {COMMAND} project run {project_dir} {subcommand}")
|
print(f"Usage: {COMMAND} project run {project_dir} {subcommand}")
|
||||||
help_text = commands[subcommand].get("help")
|
help_text = commands[subcommand].get("help")
|
||||||
if help_text:
|
if help_text:
|
||||||
|
@ -407,8 +426,7 @@ def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
|
||||||
config_commands = config.get("commands", [])
|
config_commands = config.get("commands", [])
|
||||||
variables = config.get("variables", {})
|
variables = config.get("variables", {})
|
||||||
commands = {cmd["name"]: cmd for cmd in config_commands}
|
commands = {cmd["name"]: cmd for cmd in config_commands}
|
||||||
if subcommand not in commands:
|
validate_subcommand(commands.keys(), subcommand)
|
||||||
msg.fail(f"Can't find command '{subcommand}' in project config", exits=1)
|
|
||||||
if subcommand in config.get("run", []):
|
if subcommand in config.get("run", []):
|
||||||
# This is one of the pipeline commands tracked in DVC
|
# This is one of the pipeline commands tracked in DVC
|
||||||
dvc_cmd = ["dvc", "repro", subcommand, *dvc_args]
|
dvc_cmd = ["dvc", "repro", subcommand, *dvc_args]
|
||||||
|
@ -454,10 +472,14 @@ def load_project_config(path: Path) -> Dict[str, Any]:
|
||||||
config_path = path / CONFIG_FILE
|
config_path = path / CONFIG_FILE
|
||||||
if not config_path.exists():
|
if not config_path.exists():
|
||||||
msg.fail("Can't find project config", config_path, exits=1)
|
msg.fail("Can't find project config", config_path, exits=1)
|
||||||
|
invalid_err = f"Invalid project config in {CONFIG_FILE}"
|
||||||
|
try:
|
||||||
config = srsly.read_yaml(config_path)
|
config = srsly.read_yaml(config_path)
|
||||||
|
except ValueError as e:
|
||||||
|
msg.fail(invalid_err, e, exits=1)
|
||||||
errors = validate(ProjectConfigSchema, config)
|
errors = validate(ProjectConfigSchema, config)
|
||||||
if errors:
|
if errors:
|
||||||
msg.fail(f"Invalid project config in {CONFIG_FILE}", "\n".join(errors), exits=1)
|
msg.fail(invalid_err, "\n".join(errors), exits=1)
|
||||||
return config
|
return config
|
||||||
|
|
||||||
|
|
||||||
|
@ -496,8 +518,7 @@ def update_dvc_config(
|
||||||
# commands in project.yml and should be run in sequence
|
# commands in project.yml and should be run in sequence
|
||||||
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
||||||
for name in config.get("run", []):
|
for name in config.get("run", []):
|
||||||
if name not in config_commands:
|
validate_subcommand(config_commands.keys(), name)
|
||||||
msg.fail(f"Can't find command '{name}' in project config", exits=1)
|
|
||||||
command = config_commands[name]
|
command = config_commands[name]
|
||||||
deps = command.get("deps", [])
|
deps = command.get("deps", [])
|
||||||
outputs = command.get("outputs", [])
|
outputs = command.get("outputs", [])
|
||||||
|
@ -580,9 +601,9 @@ def run_commands(
|
||||||
command = command.format(**variables)
|
command = command.format(**variables)
|
||||||
command = shlex.split(command, posix=not is_windows)
|
command = shlex.split(command, posix=not is_windows)
|
||||||
# TODO: is this needed / a good idea?
|
# TODO: is this needed / a good idea?
|
||||||
if len(command) and command[0] == "python":
|
if len(command) and command[0] in ("python", "python3"):
|
||||||
command[0] = sys.executable
|
command[0] = sys.executable
|
||||||
elif len(command) and command[0] == "pip":
|
elif len(command) and command[0] in ("pip", "pip3"):
|
||||||
command = [sys.executable, "-m", "pip", *command[1:]]
|
command = [sys.executable, "-m", "pip", *command[1:]]
|
||||||
if not silent:
|
if not silent:
|
||||||
print(" ".join(command))
|
print(" ".join(command))
|
||||||
|
@ -640,6 +661,20 @@ def check_clone(name: str, dest: Path, repo: str) -> None:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def validate_subcommand(commands: Sequence[str], subcommand: str) -> None:
|
||||||
|
"""Check that a subcommand is valid and defined. Raises an error otherwise.
|
||||||
|
|
||||||
|
commands (Sequence[str]): The available commands.
|
||||||
|
subcommand (str): The subcommand.
|
||||||
|
"""
|
||||||
|
if subcommand not in commands:
|
||||||
|
msg.fail(
|
||||||
|
f"Can't find command '{subcommand}' in {CONFIG_FILE}. "
|
||||||
|
f"Available commands: {', '.join(commands)}",
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
|
def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
|
||||||
"""Download a file using requests.
|
"""Download a file using requests.
|
||||||
|
|
||||||
|
|
|
@ -729,12 +729,13 @@ cdef class ArcEager(TransitionSystem):
|
||||||
cdef ArcEagerGold gold_ = gold
|
cdef ArcEagerGold gold_ = gold
|
||||||
gold_.update(stcls)
|
gold_.update(stcls)
|
||||||
gold_state = gold_.c
|
gold_state = gold_.c
|
||||||
n_gold = 0
|
cdef int n_gold = 0
|
||||||
for i in range(self.n_moves):
|
for i in range(self.n_moves):
|
||||||
if self.c[i].is_valid(stcls.c, self.c[i].label):
|
if self.c[i].is_valid(stcls.c, self.c[i].label):
|
||||||
is_valid[i] = True
|
is_valid[i] = True
|
||||||
costs[i] = self.c[i].get_cost(stcls, &gold_state, self.c[i].label)
|
costs[i] = self.c[i].get_cost(stcls, &gold_state, self.c[i].label)
|
||||||
n_gold += costs[i] <= 0
|
if costs[i] <= 0:
|
||||||
|
n_gold += 1
|
||||||
else:
|
else:
|
||||||
is_valid[i] = False
|
is_valid[i] = False
|
||||||
costs[i] = 9000
|
costs[i] = 9000
|
||||||
|
|
Loading…
Reference in New Issue
Block a user