Various small tweaks to project CLI (#5965)

* Fix up/download of http and local paths

* Support git_sparse_checkout for assets

* Fix scorer

* Handle already-present directories for git assets

* Improve convert command

* Fix support for existant files in git assets

* Support branches in git sparse checkout

* Format

* Fix git assets

* Document git block in assets

* Fix test

* Fix test

* Revert "Fix test"

This reverts commit cf3097260f.

* Revert "Fix test"

This reverts commit 964d636e27.

* Dont multiply p/r/f by 100

* Display scores * 100 during training
This commit is contained in:
Matthew Honnibal 2020-08-25 00:30:52 +02:00 committed by GitHub
parent abd3f2b65a
commit 8038b87f04
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 136 additions and 45 deletions

View File

@ -1,5 +1,6 @@
from typing import Dict, Any, Union, List, Optional, TYPE_CHECKING
import sys
import shutil
from pathlib import Path
from wasabi import msg
import srsly
@ -11,7 +12,7 @@ from thinc.config import Config, ConfigValidationError
from configparser import InterpolationError
from ..schemas import ProjectConfigSchema, validate
from ..util import import_file
from ..util import import_file, run_command, make_tempdir
if TYPE_CHECKING:
from pathy import Pathy # noqa: F401
@ -260,6 +261,19 @@ def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None:
src (Path): The source path.
url (str): The destination URL to upload to.
"""
import smart_open
# This logic is pretty hacky. We'd like pathy to do this probably?
if ":/" not in str(dest):
# Local path
with Path(dest).open(mode="wb") as output_file:
with src.open(mode="rb") as input_file:
output_file.write(input_file.read())
elif str(dest).startswith("http") or str(dest).startswith("https"):
with smart_open.open(str(dest), mode="wb") as output_file:
with src.open(mode="rb") as input_file:
output_file.write(input_file.read())
else:
dest = ensure_pathy(dest)
with dest.open(mode="wb") as output_file:
with src.open(mode="rb") as input_file:
@ -274,8 +288,20 @@ def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False)
force (bool): Whether to force download even if file exists.
If False, the download will be skipped.
"""
import smart_open
# This logic is pretty hacky. We'd like pathy to do this probably?
if dest.exists() and not force:
return None
if src.startswith("http"):
with smart_open.open(src, mode="rb") as input_file:
with dest.open(mode="wb") as output_file:
output_file.write(input_file.read())
elif ":/" not in src:
with open(src, mode="rb") as input_file:
with dest.open(mode="wb") as output_file:
output_file.write(input_file.read())
else:
src = ensure_pathy(src)
with src.open(mode="rb") as input_file:
with dest.open(mode="wb") as output_file:
@ -288,3 +314,29 @@ def ensure_pathy(path):
from pathy import Pathy # noqa: F811
return Pathy(path)
def git_sparse_checkout(
repo: str, subpath: str, dest: Path, *, branch: Optional[str] = None
):
if dest.exists():
raise IOError("Destination of checkout must not exist")
if not dest.parent.exists():
raise IOError("Parent of destination of checkout must exist")
# We're using Git and sparse checkout to only clone the files we need
with make_tempdir() as tmp_dir:
cmd = (
f"git clone {repo} {tmp_dir} --no-checkout "
"--depth 1 --config core.sparseCheckout=true"
)
if branch is not None:
cmd = f"{cmd} -b {branch}"
run_command(cmd)
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
f.write(subpath)
run_command(["git", "-C", str(tmp_dir), "fetch"])
run_command(["git", "-C", str(tmp_dir), "checkout"])
# We need Path(name) to make sure we also support subdirectories
shutil.move(str(tmp_dir / Path(subpath)), str(dest))
print(dest)
print(list(dest.iterdir()))

View File

@ -50,6 +50,7 @@ def convert_cli(
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),
# fmt: on
):
"""
@ -82,6 +83,7 @@ def convert_cli(
converter=converter,
ner_map=ner_map,
lang=lang,
concatenate=concatenate,
silent=silent,
msg=msg,
)
@ -100,13 +102,15 @@ def convert(
converter: str = "auto",
ner_map: Optional[Path] = None,
lang: Optional[str] = None,
concatenate: bool = False,
silent: bool = True,
msg: Optional[Printer],
) -> None:
if not msg:
msg = Printer(no_print=silent)
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
for input_loc in walk_directory(Path(input_path)):
doc_files = []
for input_loc in walk_directory(Path(input_path), converter):
input_data = input_loc.open("r", encoding="utf-8").read()
# Use converter function to convert data
func = CONVERTERS[converter]
@ -121,6 +125,13 @@ def convert(
no_print=silent,
ner_map=ner_map,
)
doc_files.append((input_loc, docs))
if concatenate:
all_docs = []
for _, docs in doc_files:
all_docs.extend(docs)
doc_files = [(input_path, all_docs)]
for input_loc, docs in doc_files:
if file_type == "json":
data = [docs_to_json(docs)]
else:
@ -174,7 +185,7 @@ def autodetect_ner_format(input_data: str) -> Optional[str]:
return None
def walk_directory(path: Path) -> List[Path]:
def walk_directory(path: Path, converter: str) -> List[Path]:
if not path.is_dir():
return [path]
paths = [path]
@ -188,6 +199,12 @@ def walk_directory(path: Path) -> List[Path]:
continue
elif path.is_dir():
paths.extend(path.iterdir())
elif converter == "json" and not path.parts[-1].endswith("json"):
continue
elif converter == "conll" and not path.parts[-1].endswith("conll"):
continue
elif converter == "iob" and not path.parts[-1].endswith("iob"):
continue
else:
locs.append(path)
return locs
@ -214,11 +231,11 @@ def verify_cli_args(
if ner_map is not None and not Path(ner_map).exists():
msg.fail("NER map not found", ner_map, exits=1)
if input_path.is_dir():
input_locs = walk_directory(input_path)
input_locs = walk_directory(input_path, converter)
if len(input_locs) == 0:
msg.fail("No input files in directory", input_path, exits=1)
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
if len(file_types) >= 2:
if converter == "auto" and len(file_types) >= 2:
file_types = ",".join(file_types)
msg.fail("All input files must be same type", file_types, exits=1)
if converter != "auto" and converter not in CONVERTERS:
@ -227,7 +244,7 @@ def verify_cli_args(
def _get_converter(msg, converter, input_path):
if input_path.is_dir():
input_path = walk_directory(input_path)[0]
input_path = walk_directory(input_path, converter)[0]
if converter == "auto":
converter = input_path.suffix[1:]
if converter == "ner" or converter == "iob":

View File

@ -7,7 +7,7 @@ import requests
from ...util import ensure_path, working_dir
from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum
from .._util import download_file
from .._util import download_file, git_sparse_checkout
# TODO: find a solution for caches
@ -45,9 +45,24 @@ def project_assets(project_dir: Path) -> None:
msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
msg.info(f"Fetching {len(assets)} asset(s)")
for asset in assets:
dest = asset["dest"]
url = asset.get("url")
dest = Path(asset["dest"])
checksum = asset.get("checksum")
if "git" in asset:
if dest.exists():
# If there's already a file, check for checksum
if checksum and checksum == get_checksum(dest):
msg.good(f"Skipping download with matching checksum: {dest}")
continue
else:
shutil.rmtree(dest)
git_sparse_checkout(
asset["git"]["repo"],
asset["git"]["path"],
dest,
branch=asset["git"].get("branch"),
)
else:
url = asset.get("url")
if not url:
# project.yml defines asset without URL that the user has to place
check_private_asset(dest, checksum)

View File

@ -6,8 +6,9 @@ import shutil
import re
from ... import about
from ...util import ensure_path, run_command, make_tempdir
from ...util import ensure_path
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
from .._util import git_sparse_checkout
@project_cli.command("clone")
@ -39,24 +40,11 @@ def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> N
check_clone(name, dest, repo)
project_dir = dest.resolve()
repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
# We're using Git and sparse checkout to only clone the files we need
with make_tempdir() as tmp_dir:
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
try:
run_command(cmd)
except subprocess.CalledProcessError:
err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
msg.fail(err)
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
f.write(name)
try:
run_command(["git", "-C", str(tmp_dir), "fetch"])
run_command(["git", "-C", str(tmp_dir), "checkout"])
git_sparse_checkout(repo, name, dest)
except subprocess.CalledProcessError:
err = f"Could not clone '{name}' from repo '{repo_name}'"
msg.fail(err)
# We need Path(name) to make sure we also support subdirectories
shutil.move(str(tmp_dir / Path(name)), str(project_dir))
msg.good(f"Cloned '{name}' from {repo_name}", project_dir)
if not (project_dir / PROJECT_FILE).exists():
msg.warn(f"No {PROJECT_FILE} found in directory")

View File

@ -382,7 +382,7 @@ def setup_printer(
try:
scores = [
"{0:.2f}".format(float(info["other_scores"].get(col, 0.0)))
"{0:.2f}".format(float(info["other_scores"].get(col, 0.0)) * 100)
for col in score_cols
]
except KeyError as e:

View File

@ -30,17 +30,17 @@ class PRFScore:
@property
def precision(self) -> float:
return (self.tp / (self.tp + self.fp + 1e-100)) * 100
return (self.tp / (self.tp + self.fp + 1e-100))
@property
def recall(self) -> float:
return (self.tp / (self.tp + self.fn + 1e-100)) * 100
return (self.tp / (self.tp + self.fn + 1e-100))
@property
def fscore(self) -> float:
p = self.precision
r = self.recall
return (2 * ((p * r) / (p + r + 1e-100))) * 100
return 2 * ((p * r) / (p + r + 1e-100))
def to_dict(self) -> Dict[str, float]:
return {"p": self.precision, "r": self.recall, "f": self.fscore}

View File

@ -102,6 +102,25 @@ $ cd some_example_project
$ python -m spacy project assets
```
Asset URLs can be a number of different protocols: HTTP, HTTPS, FTP, SSH, and
even cloud storage such as GCS and S3. You can also fetch assets using git, by
replacing the `url` string with a `git` block, like this:
> #### project.yml
>
> ```yaml
> assets:
> - dest: 'assets/training.spacy'
> git:
> repo: "https://github.com/example/repo"
> branch: "master"
> path: "some/path"
> checksum: '63373dd656daa1fd3043ce166a59474c'
> ```
spaCy will use Git's "sparse checkout" feature, to avoid download the whole
repository.
### 3. Run a command {#run}
> #### project.yml