From 3e83a509bb21d2d315aa1f54f4615d196cb19ea2 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 10 Sep 2020 15:49:13 +0200 Subject: [PATCH 1/5] WIP: fix project clone compatibility --- spacy/cli/_util.py | 57 ++++++++++++++++++++++++++------------ spacy/cli/project/clone.py | 2 +- spacy/util.py | 2 +- 3 files changed, 41 insertions(+), 20 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index b03f3eb69..4694fddbb 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -1,4 +1,4 @@ -from typing import Dict, Any, Union, List, Optional, TYPE_CHECKING +from typing import Dict, Any, Union, List, Optional, Tuple, TYPE_CHECKING import sys import shutil from pathlib import Path @@ -321,41 +321,62 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m # *that* we can do by path. # We're using Git and sparse checkout to only clone the files we need with make_tempdir() as tmp_dir: + git_version = get_git_version() + supports_sparse = git_version >= (2, 22) # This is the "clone, but don't download anything" part. - cmd = ( - f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " - f"--filter=blob:none " # <-- The key bit - f"-b {branch}" - ) + cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} " + if supports_sparse: + cmd += f"--filter=blob:none" # <-- The key bit + else: + msg.warn( + f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) " + f"that doesn't fully support sparse checkout yet. This means that " + f"more files than necessary may be cloned. To only download the " + f"files needed, upgrade to Git v2.22 or above." + ) _attempt_run_command(cmd) # Now we need to find the missing filenames for the subpath we want. # Looking for this 'rev-list' command in the git --help? Hah. - cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}" + cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if supports_sparse else ''} -- {subpath}" ret = _attempt_run_command(cmd) git_repo = _from_http_to_git(repo) # Now pass those missings into another bit of git internals missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")]) - if not missings: - err = f"Could not find any relevant files for '{subpath}'. " \ - f"Did you specify a correct and complete path within repo '{repo}' " \ - f"and branch {branch}?" - msg.fail(err, exits=1) - cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}" - _attempt_run_command(cmd) + if supports_sparse and not missings: + err = ( + f"Could not find any relevant files for '{subpath}'. " + f"Did you specify a correct and complete path within repo '{repo}' " + f"and branch {branch}?" + ) + msg.fail(err, exits=1) + if supports_sparse: + cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}" + _attempt_run_command(cmd) # And finally, we can checkout our subpath cmd = f"git -C {tmp_dir} checkout {branch} {subpath}" _attempt_run_command(cmd) # We need Path(name) to make sure we also support subdirectories shutil.move(str(tmp_dir / Path(subpath)), str(dest)) -def _attempt_run_command(cmd): + +def get_git_version() -> Tuple[int, int]: + ret = _attempt_run_command(["git", "--version"]) + # TODO: this seems kinda brittle? + version = ret.stdout[11:].strip().split(".") + return (int(version[0]), int(version[1])) + + +def _attempt_run_command(cmd: Union[str, List[str]]): try: return run_command(cmd, capture=True) except subprocess.CalledProcessError as e: - err = f"Could not run command: {cmd}." - msg.fail(err, exits=1) + err = f"Could not run command" + msg.fail(err) + print(cmd) + sys.exit(1) -def _from_http_to_git(repo): + +def _from_http_to_git(repo: str) -> str: if repo.startswith("http://"): repo = repo.replace(r"http://", r"https://") if repo.startswith(r"https://"): diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py index 427df490f..ab617e4ba 100644 --- a/spacy/cli/project/clone.py +++ b/spacy/cli/project/clone.py @@ -27,7 +27,7 @@ def project_clone_cli( DOCS: https://nightly.spacy.io/api/cli#project-clone """ if dest is None: - dest = Path.cwd() / name + dest = Path.cwd() / Path(name).parts[-1] project_clone(name, dest, repo=repo, branch=branch) diff --git a/spacy/util.py b/spacy/util.py index bd567ddc7..d8df04554 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -648,7 +648,7 @@ def join_command(command: List[str]) -> str: return " ".join(shlex.quote(cmd) for cmd in command) -def run_command(command: Union[str, List[str]], *, capture=False, stdin=None) -> None: +def run_command(command: Union[str, List[str]], *, capture=False, stdin=None): """Run a command on the command line as a subprocess. If the subprocess returns a non-zero exit code, a system exit is performed. From 1bce432b4af2c3bc3ea98a8bad54690f6ff01ec3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 11 Sep 2020 10:00:49 +0200 Subject: [PATCH 2/5] Adjust message [ci skip] --- spacy/cli/_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 1c4a16f7a..c64aa1507 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -331,8 +331,8 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m msg.warn( f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) " f"that doesn't fully support sparse checkout yet. This means that " - f"more files than necessary may be cloned. To only download the " - f"files needed, upgrade to Git v2.22 or above." + f"more files than necessary may be downloaded temporarily. To " + f"only download the files needed, upgrade to Git v2.22 or above." ) _attempt_run_command(cmd) # Now we need to find the missing filenames for the subpath we want. From 0b2e07215db6eba5b890d480a0642a1cdb013878 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 11 Sep 2020 11:38:28 +0200 Subject: [PATCH 3/5] Support overwriting name on spacy package --- spacy/cli/package.py | 5 +++++ website/docs/api/cli.md | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index c457b3e17..8d6cd84c1 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -18,6 +18,7 @@ def package_cli( output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False), meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False), create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"), + name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"), version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"), no_sdist: bool = Opt(False, "--no-sdist", "-NS", help="Don't build .tar.gz sdist, can be set if you want to run this step manually"), force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing data in output directory"), @@ -38,6 +39,7 @@ def package_cli( input_dir, output_dir, meta_path=meta_path, + name=name, version=version, create_meta=create_meta, create_sdist=not no_sdist, @@ -50,6 +52,7 @@ def package( input_dir: Path, output_dir: Path, meta_path: Optional[Path] = None, + name: Optional[str] = None, version: Optional[str] = None, create_meta: bool = False, create_sdist: bool = True, @@ -71,6 +74,8 @@ def package( msg.fail("Can't load pipeline meta.json", meta_path, exits=1) meta = srsly.read_json(meta_path) meta = get_meta(input_dir, meta) + if name is not None: + meta["name"] = name if version is not None: meta["version"] = version if not create_meta: # only print if user doesn't want to overwrite diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index ea61b9ae3..47af9be96 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -852,7 +852,7 @@ this, you can set the `--no-sdist` flag. ```cli -$ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--no-sdist] [--version] [--force] +$ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--no-sdist] [--name] [--version] [--force] ``` > #### Example @@ -870,6 +870,7 @@ $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] | `--meta-path`, `-m` 2 | Path to [`meta.json`](/api/data-formats#meta) file (optional). ~~Optional[Path] \(option)~~ | | `--create-meta`, `-C` 2 | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. ~~bool (flag)~~ | | `--no-sdist`, `-NS`, | Don't build the `.tar.gz` sdist automatically. Can be set if you want to run this step manually. ~~bool (flag)~~ | +| `--name`, `-n` 3 | Package name to override in meta. ~~Optional[str] \(option)~~ | | `--version`, `-v` 3 | Package version to override in meta. Useful when training new versions, as it doesn't require editing the meta template. ~~Optional[str] \(option)~~ | | `--force`, `-f` | Force overwriting of existing folder in output directory. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | From 62eec33bc43150636b9c1a8be561cb9fb4f58425 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 11 Sep 2020 11:38:33 +0200 Subject: [PATCH 4/5] Fix meta.json validation --- spacy/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/schemas.py b/spacy/schemas.py index baa893802..38f47c668 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -180,7 +180,7 @@ class ModelMetaSchema(BaseModel): url: StrictStr = Field("", title="Model author URL") sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources") vectors: Dict[str, Any] = Field({}, title="Included word vectors") - labels: Dict[str, Dict[str, List[str]]] = Field({}, title="Component labels, keyed by component name") + labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name") accuracy: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy numbers") speed: Dict[str, Union[float, int]] = Field({}, title="Speed evaluation numbers") spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used") From 711166a75a9843fb454d23d795cb77d966ed96e7 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 11 Sep 2020 15:12:05 +0200 Subject: [PATCH 5/5] prevent overwriting score_weights --- spacy/language.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index 777b0c24b..70dad59f3 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -243,7 +243,8 @@ class Language: self._config["nlp"]["pipeline"] = list(self.component_names) self._config["nlp"]["disabled"] = list(self.disabled) self._config["components"] = pipeline - self._config["training"]["score_weights"] = combine_score_weights(score_weights) + if not self._config["training"].get("score_weights"): + self._config["training"]["score_weights"] = combine_score_weights(score_weights) if not srsly.is_json_serializable(self._config): raise ValueError(Errors.E961.format(config=self._config)) return self._config