From 520d25cb50675a45cdb6c646b30e3a5ea9b71bae Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 26 Jul 2020 12:15:00 +0200 Subject: [PATCH] Add smart_open dependency to fetch project assets (#5812) * Use smart_open for project assets * Fix assets.py * Update pyproject.toml --- pyproject.toml | 3 ++- requirements.txt | 1 + setup.cfg | 1 + spacy/cli/project/assets.py | 23 +++++++---------------- 4 files changed, 11 insertions(+), 17 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a3e32ca15..91f1464df 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ requires = [ "murmurhash>=0.28.0,<1.1.0", "thinc>=8.0.0a19,<8.0.0a30", "blis>=0.4.0,<0.5.0", - "pytokenizations" + "pytokenizations", + "smart_open>=2.0.0,<3.0.0" ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 089e4297d..d0413825b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,6 +15,7 @@ requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 pydantic>=1.3.0,<2.0.0 pytokenizations +smart_open>=2.0.0,<3.0.0 # Official Python utilities setuptools packaging diff --git a/setup.cfg b/setup.cfg index 0ff26ab77..d2cb7c92a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -52,6 +52,7 @@ install_requires = requests>=2.13.0,<3.0.0 pydantic>=1.3.0,<2.0.0 pytokenizations + smart_open>=2.0.0,<3.0.0 # Official Python utilities setuptools packaging diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py index 2b7dbaf66..1bd28cb7e 100644 --- a/spacy/cli/project/assets.py +++ b/spacy/cli/project/assets.py @@ -1,15 +1,17 @@ from typing import Optional from pathlib import Path from wasabi import msg -import requests import tqdm import re import shutil +import requests +import smart_open from ...util import ensure_path, working_dir from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum + # TODO: find a solution for caches # CACHES = [ # Path.home() / ".torch", @@ -135,23 +137,12 @@ def convert_asset_url(url: str) 
def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
    """Download a file using smart_open, streaming it to disk in chunks.

    The source is copied block by block so that large assets are never held
    in memory in full.

    url (str): The URL of the file. Anything smart_open.open accepts.
    dest (Path): The destination path. Opened in binary write mode, so an
        existing file at this location is overwritten.
    chunk_size (int): The size (in bytes) of chunks to read/write. A
        non-positive value falls back to a single unbounded read.
    """
    # read(-1) means "read everything"; used only when the caller passes a
    # chunk size that cannot drive a chunked loop (0 would otherwise copy
    # nothing, because read(0) returns b"" immediately).
    read_size = chunk_size if chunk_size > 0 else -1
    with smart_open.open(url, mode="rb") as input_file:
        with dest.open(mode="wb") as output_file:
            # iter(callable, sentinel) keeps calling read() until it returns
            # b"", which signals end of stream for binary file objects.
            for chunk in iter(lambda: input_file.read(read_size), b""):
                output_file.write(chunk)