Add smart_open dependency to fetch project assets (#5812)

* Use smart_open for project assets

* Fix assets.py

* Update pyproject.toml
This commit is contained in:
Matthew Honnibal 2020-07-26 12:15:00 +02:00 committed by GitHub
parent c288dba8e7
commit 520d25cb50
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 11 additions and 17 deletions

View File

@ -8,6 +8,7 @@ requires = [
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.0a19,<8.0.0a30",
"blis>=0.4.0,<0.5.0",
"pytokenizations"
"pytokenizations",
"smart_open>=2.0.0,<3.0.0"
]
build-backend = "setuptools.build_meta"

View File

@ -15,6 +15,7 @@ requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0
pydantic>=1.3.0,<2.0.0
pytokenizations
smart_open>=2.0.0,<3.0.0
# Official Python utilities
setuptools
packaging

View File

@ -52,6 +52,7 @@ install_requires =
requests>=2.13.0,<3.0.0
pydantic>=1.3.0,<2.0.0
pytokenizations
smart_open>=2.0.0,<3.0.0
# Official Python utilities
setuptools
packaging

View File

@ -1,15 +1,17 @@
from typing import Optional
from pathlib import Path
from wasabi import msg
import requests
import tqdm
import re
import shutil
import requests
import smart_open
from ...util import ensure_path, working_dir
from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum
# TODO: find a solution for caches
# CACHES = [
# Path.home() / ".torch",
@ -135,23 +137,12 @@ def convert_asset_url(url: str) -> str:
def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
    """Download a file using smart_open.

    The source is streamed to disk in blocks of chunk_size bytes, so the
    whole asset is never held in memory at once.

    url (str): The URL of the file.
    dest (Path): The destination path.
    chunk_size (int): The size of chunks to read/write.
    """
    with smart_open.open(url, mode="rb") as input_file:
        with dest.open(mode="wb") as output_file:
            # Copy block by block instead of input_file.read(), which would
            # load the full payload into memory and ignore chunk_size.
            shutil.copyfileobj(input_file, output_file, chunk_size)