From 03d2f98cd550ec3162bd27c33409c4d8413367ac Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Sun, 15 Nov 2015 15:58:21 +0100 Subject: [PATCH 1/6] add sputnik --- spacy/en/download.py | 75 ++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 45 deletions(-) diff --git a/spacy/en/download.py b/spacy/en/download.py index fcc1fe934..a10200899 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -1,65 +1,50 @@ -from __future__ import print_function import sys import os -import tarfile import shutil + import plac - -from . import uget +from sputnik import Sputnik -try: - FileExistsError -except NameError: - FileExistsError = Exception +def migrate(path): + data_path = os.path.join(path, 'data') + if os.path.isdir(data_path) and not os.path.islink(data_path): + shutil.rmtree(data_path) + for filename in os.listdir(path): + if filename.endswith('tgz'): + os.unlink(os.path.join(path, filename)) -# TODO: Read this from the same source as the setup -VERSION = '0.9.9' - -AWS_STORE = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com' - -ALL_DATA_DIR_URL = '%s/en_data_all-%s.tgz' % (AWS_STORE, VERSION) - -DEST_DIR = os.path.dirname(os.path.abspath(__file__)) - - -def download_file(url, download_path): - return uget.download(url, download_path, console=sys.stdout) - - -def install_data(url, extract_path, download_path): - try: - os.makedirs(extract_path) - except FileExistsError: - pass - - tmp = download_file(url, download_path) - assert tmp == download_path - t = tarfile.open(download_path) - t.extractall(extract_path) - os.unlink(download_path) +def link(package, path): + if os.path.exists(path): + os.unlink(path) + os.symlink(os.path.join(package.path, 'data'), + os.path.join(path)) @plac.annotations( force=("Force overwrite", "flag", "f", bool), ) -def main(data_size='all', force=False): - filename = ALL_DATA_DIR_URL.rsplit('/', 1)[1] - download_path = os.path.join(DEST_DIR, filename) - data_path = os.path.join(DEST_DIR, 'data') +def main(force=False): + # TODO read version from the same source as the setup + sputnik = Sputnik('spacy', '0.99.0', console=sys.stdout) - if force and os.path.exists(download_path): - os.unlink(download_path) + path = os.path.dirname(os.path.abspath(__file__)) - if force and os.path.exists(data_path): - shutil.rmtree(data_path) + command = sputnik.make_command( + data_path=path, + repository_url=os.environ.get('REPOSITORY_URL')) - if os.path.exists(data_path): - print('data already installed at %s, overwrite with --force' % DEST_DIR) - sys.exit(1) + if force: + command.purge() - install_data(ALL_DATA_DIR_URL, DEST_DIR, download_path) + package = command.install('en_default') + + # FIXME clean up old-style packages + migrate(path) + + # FIXME supply spacy with an old-style data dir + link(package, os.path.join(path, 'data')) if __name__ == '__main__': From 12de895e60f2066dea2548d6cf91eb5cd7145129 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Sun, 15 Nov 2015 16:38:16 +0100 Subject: [PATCH 2/6] fix version --- requirements.txt | 1 + setup.py | 2 +- spacy/en/download.py | 3 +-- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index ac7c69f9c..8a6e7dc9a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,4 @@ plac six ujson cloudpickle +sputnik == 0.4.1 diff --git a/setup.py b/setup.py index 40c25f42f..00b92368e 100644 --- a/setup.py +++ b/setup.py @@ -179,7 +179,7 @@ def run_setup(exts): license="MIT", install_requires=['numpy', 'murmurhash', 'cymem == 1.30', 'preshed == 0.44', 'thinc == 4.0.0', "text_unidecode", 'plac', 'six', - 'ujson', 'cloudpickle'], + 'ujson', 'cloudpickle', 'sputnik == 0.4.1'], setup_requires=["headers_workaround"], cmdclass = {'build_ext': build_ext_subclass }, ) diff --git a/spacy/en/download.py b/spacy/en/download.py index a10200899..47a33ec92 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -18,8 +18,7 @@ def migrate(path): def link(package, path): if os.path.exists(path): os.unlink(path) - os.symlink(os.path.join(package.path, 'data'), - os.path.join(path)) + os.symlink(package.dir_path('data'), path) @plac.annotations( From 919a4f0b04da03b9c6ab19692b21bc3d0a2b3aeb Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Wed, 18 Nov 2015 11:40:46 +0100 Subject: [PATCH 3/6] change data path, add repository --- requirements.txt | 2 +- setup.py | 2 +- spacy/en/download.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8a6e7dc9a..ffa8664d5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,4 @@ plac six ujson cloudpickle -sputnik == 0.4.1 +sputnik == 0.5.1 diff --git a/setup.py b/setup.py index 00b92368e..eec86b537 100644 --- a/setup.py +++ b/setup.py @@ -179,7 +179,7 @@ def run_setup(exts): license="MIT", install_requires=['numpy', 'murmurhash', 'cymem == 1.30', 'preshed == 0.44', 'thinc == 4.0.0', "text_unidecode", 'plac', 'six', - 'ujson', 'cloudpickle', 'sputnik == 0.4.1'], + 'ujson', 'cloudpickle', 'sputnik == 0.5.1'], setup_requires=["headers_workaround"], cmdclass = {'build_ext': build_ext_subclass }, ) diff --git a/spacy/en/download.py b/spacy/en/download.py index 47a33ec92..9e2066647 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -31,8 +31,8 @@ def main(force=False): path = os.path.dirname(os.path.abspath(__file__)) command = sputnik.make_command( - data_path=path, - repository_url=os.environ.get('REPOSITORY_URL')) + data_path=os.path.join(path, '..', 'data'), + repository_url='http://sputnik-production.elasticbeanstalk.com') if force: command.purge() From 02a1dcec766b8f2759bcc49fb7a387aa6b248786 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Wed, 18 Nov 2015 11:48:55 +0100 Subject: [PATCH 4/6] add data dir --- spacy/data/.gitignore | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 spacy/data/.gitignore diff --git a/spacy/data/.gitignore b/spacy/data/.gitignore new file mode 100644 index 000000000..5e7d2734c --- /dev/null +++ b/spacy/data/.gitignore @@ -0,0 +1,4 @@ +# Ignore everything in this directory +* +# Except this file +!.gitignore From 50d15ea5d2919cb64f27adc64ac8697c4f465c9c Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Wed, 18 Nov 2015 17:35:21 +0100 Subject: [PATCH 5/6] fix --- spacy/en/download.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/en/download.py b/spacy/en/download.py index 9e2066647..5a69d0f39 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -11,7 +11,7 @@ def migrate(path): if os.path.isdir(data_path) and not os.path.islink(data_path): shutil.rmtree(data_path) for filename in os.listdir(path): - if filename.endswith('tgz'): + if filename.endswith('.tgz'): os.unlink(os.path.join(path, filename)) @@ -24,14 +24,14 @@ def link(package, path): @plac.annotations( force=("Force overwrite", "flag", "f", bool), ) -def main(force=False): +def main(data_size='all', force=False): # TODO read version from the same source as the setup sputnik = Sputnik('spacy', '0.99.0', console=sys.stdout) path = os.path.dirname(os.path.abspath(__file__)) command = sputnik.make_command( - data_path=os.path.join(path, '..', 'data'), + data_path=os.path.abspath(os.path.join(path, '..', 'data')), repository_url='http://sputnik-production.elasticbeanstalk.com') if force: From 73e5650be5b8055d225b846aa968864569cc9f62 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Wed, 18 Nov 2015 18:09:46 +0100 Subject: [PATCH 6/6] change index server --- spacy/en/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/en/download.py b/spacy/en/download.py index 5a69d0f39..26d2b44be 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -32,7 +32,7 @@ def main(data_size='all', force=False): command = sputnik.make_command( data_path=os.path.abspath(os.path.join(path, '..', 'data')), - repository_url='http://sputnik-production.elasticbeanstalk.com') + repository_url='https://index.spacy.io') if force: command.purge()