2021-09-21 11:49:45 +03:00
from typing import Any , Dict , Optional
2020-07-09 02:42:51 +03:00
from pathlib import Path
from wasabi import msg
2022-01-12 12:34:23 +03:00
import os
2020-07-09 02:42:51 +03:00
import re
import shutil
2020-07-26 13:15:00 +03:00
import requests
2021-09-21 11:49:45 +03:00
import typer
2020-07-09 02:42:51 +03:00
2020-07-10 00:51:18 +03:00
from . . . util import ensure_path , working_dir
2020-09-14 15:12:58 +03:00
from . . _util import project_cli , Arg , Opt , PROJECT_FILE , load_project_config
from . . _util import get_checksum , download_file , git_checkout , get_git_version
2021-09-21 11:49:45 +03:00
from . . _util import SimpleFrozenDict , parse_config_overrides
2020-07-09 02:42:51 +03:00
2021-09-21 11:49:45 +03:00
@project_cli.command(
    "assets",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_assets_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v2.22+."),
    # fmt: on
) -> None:
    """Fetch project assets like datasets and pretrained weights. Assets are
    defined in the "assets" section of the project.yml. If a checksum is
    provided in the project.yml, the file is only downloaded if no local file
    with the same checksum exists.

    DOCS: https://spacy.io/api/cli#project-assets
    """
    # NOTE: the docstring above is kept as-is on purpose, since the CLI
    # framework surfaces it as the command's help text.
    # The command accepts unknown options (see context_settings above); they
    # end up in ctx.args and are parsed into overrides for the project config.
    overrides = parse_config_overrides(ctx.args)
    project_assets(project_dir, overrides=overrides, sparse_checkout=sparse_checkout)
2020-07-09 02:42:51 +03:00
2021-09-21 11:49:45 +03:00
def project_assets(
    project_dir: Path,
    *,
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    sparse_checkout: bool = False,
) -> None:
    """Fetch all assets listed in the "assets" section of a project config.

    Assets with a "git" section are checked out from the given repository.
    Assets with a "url" are passed to fetch_asset. Assets with neither are
    treated as private files and only checked via check_private_asset.

    project_dir (Path): Path to project directory.
    overrides (Dict[str, Any]): Overrides passed on when loading the project
        config.
    sparse_checkout (bool): Use sparse checkout for assets provided via Git.
    """
    project_path = ensure_path(project_dir)
    config = load_project_config(project_path, overrides=overrides)
    assets = config.get("assets", {})
    if not assets:
        msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
    msg.info(f"Fetching {len(assets)} asset(s)")
    for asset in assets:
        # Build the destination from the path returned by ensure_path rather
        # than from the raw argument, so it matches what the config was
        # loaded from and what is handed to fetch_asset below.
        dest = (project_path / asset["dest"]).resolve()
        checksum = asset.get("checksum")
        if "git" in asset:
            _fetch_git_asset(asset, dest, checksum, sparse_checkout)
            continue
        url = asset.get("url")
        if not url:
            # project.yml defines asset without URL that the user has to place
            check_private_asset(dest, checksum)
            continue
        fetch_asset(project_path, url, dest, checksum)


def _fetch_git_asset(
    asset: Dict[str, Any],
    dest: Path,
    checksum: Optional[str],
    sparse_checkout: bool,
) -> None:
    """Check out a single asset that is described by a "git" section.

    asset (Dict[str, Any]): The asset entry from the project config.
    dest (Path): Resolved destination path of the asset.
    checksum (Optional[str]): Optional expected checksum of the destination.
    sparse_checkout (bool): Passed to git_checkout as the "sparse" setting.
    """
    git_err = (
        "Cloning spaCy project templates requires Git and the 'git' command. "
        "Make sure it's installed and that the executable is available."
    )
    get_git_version(error=git_err)
    # If there's already a file with the expected checksum, nothing to do
    if dest.exists() and checksum and checksum == get_checksum(dest):
        msg.good(f"Skipping download with matching checksum: {asset['dest']}")
        return
    git_info = asset["git"]
    # Validate the config *before* touching anything on disk, so that an
    # invalid asset definition can't wipe an existing local copy.
    if git_info.get("repo") is None:
        msg.fail(
            "A git asset must include 'repo', the repository address.", exits=1
        )
    if git_info.get("path") is None:
        msg.fail(
            "A git asset must include 'path' - use \"\" to get the entire repository.",
            exits=1,
        )
    # Remove a stale or unverified destination so the checkout starts clean
    if dest.exists():
        if dest.is_dir():
            shutil.rmtree(dest)
        else:
            dest.unlink()
    git_checkout(
        git_info["repo"],
        git_info["path"],
        dest,
        branch=git_info.get("branch"),
        sparse=sparse_checkout,
    )
    msg.good(f"Downloaded asset {dest}")
2020-07-09 02:42:51 +03:00
def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
    """Report the state of an asset that has no URL, i.e. a private asset the
    user is expected to put in place manually. Only prints feedback; nothing
    is downloaded or modified.

    dest (Path): Destination path of the asset.
    checksum (Optional[str]): Optional checksum of the expected file.
    """
    # Nothing on disk yet: tell the user which file they have to provide
    if not Path(dest).exists():
        msg.warn(
            f"No URL provided for asset. You need to add this file yourself: {dest}"
        )
        return
    # File is present and there's no checksum to compare it against
    if not checksum:
        msg.good(f"Asset already exists: {dest}")
        return
    # File is present and a checksum was given: report whether it matches
    if checksum == get_checksum(dest):
        msg.good(f"Asset exists with matching checksum: {dest}")
        return
    msg.fail(f"Asset available but with incorrect checksum: {dest}")
def fetch_asset(
    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
) -> None:
    """Fetch an asset from a given URL or path. If a checksum is provided and a
    local file exists with that checksum, the download is skipped. Progress
    and failures are reported via messages; nothing is returned.

    project_path (Path): Path to project directory.
    url (str): URL or path to asset.
    dest (Path): Destination of the asset, joined onto project_path.
    checksum (Optional[str]): Optional expected checksum of local file.
    """
    dest_path = (project_path / dest).resolve()
    if dest_path.exists():
        # If there's already a file, check for checksum
        if checksum:
            if checksum == get_checksum(dest_path):
                msg.good(f"Skipping download with matching checksum: {dest}")
                return
        # If there's not a checksum, make sure the file is a possibly valid size
        elif dest_path.stat().st_size == 0:
            msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}")
            dest_path.unlink()
    # We might as well support the user here and create parent directories in
    # case the asset dir isn't listed as a dir to create in the project.yml
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    with working_dir(project_path):
        url = convert_asset_url(url)
        try:
            download_file(url, dest_path)
        except requests.exceptions.RequestException as e:
            if Path(url).is_file():
                # If it's a local file, copy to destination
                shutil.copy(url, str(dest_path))
                msg.good(f"Copied local asset {dest}")
            else:
                msg.fail(f"Download failed: {dest}", e)
                # Nothing was fetched, so there's no file to verify below.
                # The failure is reported but deliberately not fatal.
                return
        else:
            msg.good(f"Downloaded asset {dest}")
    if checksum and checksum != get_checksum(dest_path):
        msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
def convert_asset_url(url: str) -> str:
    """Check and convert the asset URL if needed. Regular GitHub page URLs
    are rewritten to raw.githubusercontent.com URLs, with a warning. Release
    download URLs and all other URLs are returned unchanged.

    url (str): The asset URL.
    RETURNS (str): The converted URL.
    """
    # If the asset URL is a regular GitHub URL it's likely a mistake. The dot
    # is escaped and the host has to end right after "github.com", so hosts
    # like "github.com.example.org" aren't mistaken for GitHub.
    is_github_page = re.match(r"https?://github\.com(/|$)", url) is not None
    if not is_github_page or "releases/download" in url:
        return url
    # Only rewrite the first occurrence of each: the host, and the
    # "tree"/"blob" segment. Later path segments that happen to contain the
    # same text belong to the file path and must be preserved.
    converted = url.replace("github.com", "raw.githubusercontent.com", 1)
    converted = re.sub(r"/(tree|blob)/", "/", converted, count=1)
    msg.warn(
        "Downloading from a regular GitHub URL. This will only download "
        "the source of the page, not the actual file. Converting the URL "
        "to a raw URL.",
        converted,
    )
    return converted