From d4cc736b7c8f042e2385a16816aa3f9316478f8c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 20 May 2018 20:26:56 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=92=AB=20Improve=20model=20downloads:=20c?= =?UTF-8?q?heck=20for=20existing=20install,=20customise=20pip=20and=20use?= =?UTF-8?q?=20requests=20library=20again=20(#2346)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Go back to using requests instead of urllib (closes #2320) Fewer dependencies are good, but this one was simply causing too many other problems around SSL verification and Python 2/3 compatibility. requests is a popular enough package that it's okay for spaCy to depend on it – and this will hopefully make model downloads less flakey. * Only download model if not installed (see #1456) Use #egg=model==version to allow pip to check for existing installations. The download is only started if no installation matching the package/version is found. Fixes a long-standing inconvenience. * Pass additional options to pip when installing model (resolves #1456) Treat all additional arguments passed to the download command as pip options to allow the user to customise the command. For example: python -m spacy download en --user * Add CLI option to enable installing model package dependencies * Revert "Add CLI option to enable installing model package dependencies" This reverts commit 9336ffe6959a9d538a5059be7ea84a639b12a1ae. 
* Update documentation --- requirements.txt | 1 + setup.py | 3 +- spacy/cli/_messages.py | 2 +- spacy/cli/download.py | 66 +++++++++++++++--------------------------- spacy/cli/validate.py | 14 ++++----- spacy/compat.py | 21 -------------- website/api/cli.jade | 15 ++++++++-- 7 files changed, 48 insertions(+), 74 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4fda2da37..b2e245e36 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,5 +9,6 @@ plac<1.0.0,>=0.9.6 ujson>=1.35 dill>=0.2,<0.3 regex==2017.4.5 +requests>=2.13.0,<3.0.0 pytest>=3.0.6,<4.0.0 mock>=2.0.0,<3.0.0 diff --git a/setup.py b/setup.py index c295e05cf..f52f1ad0f 100755 --- a/setup.py +++ b/setup.py @@ -195,7 +195,8 @@ def setup_package(): 'pathlib', 'ujson>=1.35', 'dill>=0.2,<0.3', - 'regex==2017.4.5'], + 'regex==2017.4.5', + 'requests>=2.13.0,<3.0.0'], classifiers=[ 'Development Status :: 5 - Production/Stable', 'Environment :: Console', diff --git a/spacy/cli/_messages.py b/spacy/cli/_messages.py index c3c9e496f..88dcb1b35 100644 --- a/spacy/cli/_messages.py +++ b/spacy/cli/_messages.py @@ -7,7 +7,7 @@ class Messages(object): M002 = ("Creating a shortcut link for 'en' didn't work (maybe you " "don't have admin permissions?), but you can still load the " "model via its full package name: nlp = spacy.load('{name}')") - M003 = ("Server error ({code}: {desc})") + M003 = ("Server error ({code})") M004 = ("Couldn't fetch {desc}. Please find a model for your spaCy " "installation (v{version}), and download it manually. 
For more " "details, see the documentation: https://spacy.io/usage/models") diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 558f84c7e..ec5043562 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -2,15 +2,14 @@ from __future__ import unicode_literals import plac +import requests import os import subprocess import sys -import ujson -from .link import link from ._messages import Messages +from .link import link from ..util import prints, get_package_path -from ..compat import url_read, HTTPError from .. import about @@ -18,41 +17,23 @@ from .. import about model=("model to download, shortcut or name)", "positional", None, str), direct=("force direct download. Needs model name with version and won't " "perform compatibility check", "flag", "d", bool), - insecure=("insecure mode - disables the verification of certificates", - "flag", "i", bool), - ca_file=("specify a certificate authority file to use for certificates " - "validation. Ignored if --insecure is used", "option", "c")) -def download(model, direct=False, insecure=False, ca_file=None): + pip_args=("additional arguments to be passed to `pip install` when " + "installing the model")) +def download(model, direct=False, *pip_args): """ Download compatible model from default download path using pip. Model can be shortcut, model name or, if --direct flag is set, full model name with version. - The --insecure optional flag can be used to disable ssl verification - The --ca-file option can be used to provide a local CA file - used for certificate verification. """ - - # ssl_verify is the argument handled to the 'verify' parameter - # of requests package. 
It must be either None, a boolean, - # or a string containing the path to CA file - ssl_verify = None - if insecure: - ca_file = None - ssl_verify = False - else: - if ca_file is not None: - ssl_verify = ca_file - - # Download the model if direct: - dl = download_model('{m}/{m}.tar.gz'.format(m=model)) + dl = download_model('{m}/{m}.tar.gz#egg={m}'.format(m=model), pip_args) else: - shortcuts = get_json(about.__shortcuts__, "available shortcuts", ssl_verify) + shortcuts = get_json(about.__shortcuts__, "available shortcuts") model_name = shortcuts.get(model, model) - compatibility = get_compatibility(ssl_verify) + compatibility = get_compatibility() version = get_version(model_name, compatibility) - dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, - v=version)) + dl = download_model('{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}' + .format(m=model_name, v=version), pip_args) if dl != 0: # if download subprocess doesn't return 0, exit sys.exit(dl) try: @@ -69,19 +50,18 @@ def download(model, direct=False, insecure=False, ca_file=None): prints(Messages.M001.format(name=model_name), title=Messages.M002) -def get_json(url, desc, ssl_verify): - try: - data = url_read(url, verify=ssl_verify) - except HTTPError as e: - prints(Messages.M004.format(desc, about.__version__), - title=Messages.M003.format(e.code, e.reason), exits=1) - return ujson.loads(data) +def get_json(url, desc): + r = requests.get(url) + if r.status_code != 200: + prints(Messages.M004.format(desc=desc, version=about.__version__), + title=Messages.M003.format(code=r.status_code), exits=1) + return r.json() -def get_compatibility(ssl_verify): +def get_compatibility(): version = about.__version__ version = version.rsplit('.dev', 1)[0] - comp_table = get_json(about.__compatibility__, "compatibility table", ssl_verify) + comp_table = get_json(about.__compatibility__, "compatibility table") comp = comp_table['spacy'] if version not in comp: prints(Messages.M006.format(version=version), 
title=Messages.M005, @@ -97,8 +77,10 @@ def get_version(model, comp): return comp[model][0] -def download_model(filename): +def download_model(filename, user_pip_args=None): download_url = about.__download_url__ + '/' + filename - return subprocess.call( - [sys.executable, '-m', 'pip', 'install', '--no-cache-dir', '--no-deps', - download_url], env=os.environ.copy()) + pip_args = ['--no-cache-dir', '--no-deps'] + if user_pip_args: + pip_args.extend(user_pip_args) + cmd = [sys.executable, '-m', 'pip', 'install'] + pip_args + [download_url] + return subprocess.call(cmd, env=os.environ.copy()) diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py index 2ef78beb5..6b0765c3e 100644 --- a/spacy/cli/validate.py +++ b/spacy/cli/validate.py @@ -5,9 +5,10 @@ import pkg_resources from pathlib import Path import sys import ujson +import requests from ._messages import Messages -from ..compat import path2str, locale_escape, url_read, HTTPError +from ..compat import path2str, locale_escape from ..util import prints, get_data_path, read_json from .. import about @@ -16,12 +17,11 @@ def validate(): """Validate that the currently installed version of spaCy is compatible with the installed models. Should be run after `pip install -U spacy`. 
""" - try: - data = url_read(about.__compatibility__) - except HTTPError as e: - title = Messages.M003.format(code=e.code, desc=e.reason) - prints(Messages.M021, title=title, exits=1) - compat = ujson.loads(data)['spacy'] + r = requests.get(about.__compatibility__) + if r.status_code != 200: + prints(Messages.M021, title=Messages.M003.format(code=r.status_code), + exits=1) + compat = r.json()['spacy'] current_compat = compat.get(about.__version__) if not current_compat: prints(about.__compatibility__, exits=1, diff --git a/spacy/compat.py b/spacy/compat.py index c5ddae0ce..dc0883542 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -33,16 +33,6 @@ try: except ImportError: from thinc.neural.optimizers import Adam as Optimizer -try: - import urllib.request -except ImportError: - import urllib2 as urllib - -try: - from urllib.error import HTTPError -except ImportError: - from urllib2 import HTTPError - pickle = pickle copy_reg = copy_reg CudaStream = CudaStream @@ -66,7 +56,6 @@ if is_python2: input_ = raw_input # noqa: F821 json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False).decode('utf8') path2str = lambda path: str(path).decode('utf8') - url_open = urllib.urlopen elif is_python3: bytes_ = bytes @@ -75,16 +64,6 @@ elif is_python3: input_ = input json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False) path2str = lambda path: str(path) - url_open = urllib.request.urlopen - - -def url_read(url): - file_ = url_open(url) - code = file_.getcode() - if code != 200: - raise HTTPError(url, code, "Cannot GET url", [], file_) - data = file_.read() - return data def b_to_str(b_str): diff --git a/website/api/cli.jade b/website/api/cli.jade index a34271d81..199de82dc 100644 --- a/website/api/cli.jade +++ b/website/api/cli.jade @@ -15,7 +15,8 @@ p | package and automatically creates a | #[+a("/usage/models#usage") shortcut link] to load the model by name. 
| Direct downloads don't perform any compatibility checks and require the - | model name to be specified with its version (e.g., #[code en_core_web_sm-1.2.0]). + | model name to be specified with its version (e.g. + | #[code en_core_web_sm-2.0.0]). +aside("Downloading best practices") | The #[code download] command is mostly intended as a convenient, @@ -35,13 +36,23 @@ p +row +cell #[code model] +cell positional - +cell Model name or shortcut (#[code en], #[code de], #[code vectors]). + +cell + | Model name or shortcut (#[code en], #[code de], + | #[code en_core_web_sm]). +row +cell #[code --direct], #[code -d] +cell flag +cell Force direct download of exact model version. + +row + +cell other + +cell - + +cell + | Additional installation options to be passed to + | #[code pip install] when installing the model package. For + | example, #[code --user] to install to the user home directory. + +row +cell #[code --help], #[code -h] +cell flag