diff --git a/requirements.txt b/requirements.txt
index ac7c69f9c..ffa8664d5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,4 @@ plac
 six
 ujson
 cloudpickle
+sputnik == 0.5.1
diff --git a/setup.py b/setup.py
index 7cc0894d1..3ecd0fd89 100644
--- a/setup.py
+++ b/setup.py
@@ -179,7 +179,7 @@ def run_setup(exts):
         license="MIT",
         install_requires=['numpy', 'murmurhash', 'cymem == 1.30', 'preshed == 0.44',
                           'thinc == 4.0.0', "text_unidecode", 'plac', 'six',
-                          'ujson', 'cloudpickle'],
+                          'ujson', 'cloudpickle', 'sputnik == 0.5.1'],
         setup_requires=["headers_workaround"],
         cmdclass = {'build_ext': build_ext_subclass },
     )
diff --git a/spacy/data/.gitignore b/spacy/data/.gitignore
new file mode 100644
index 000000000..5e7d2734c
--- /dev/null
+++ b/spacy/data/.gitignore
@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
diff --git a/spacy/en/download.py b/spacy/en/download.py
index fcc1fe934..26d2b44be 100644
--- a/spacy/en/download.py
+++ b/spacy/en/download.py
@@ -1,65 +1,54 @@
-from __future__ import print_function
 import sys
 import os
-import tarfile
 import shutil
+
 import plac
-
-from . import uget
+from sputnik import Sputnik
 
 
-try:
-    FileExistsError
-except NameError:
-    FileExistsError = Exception
+def migrate(path):
+    """Remove a non-symlink `data` dir and any `.tgz` files from `path`."""
+    data_path = os.path.join(path, 'data')
+    if os.path.isdir(data_path) and not os.path.islink(data_path):
+        shutil.rmtree(data_path)
+    for filename in os.listdir(path):
+        if filename.endswith('.tgz'):
+            os.unlink(os.path.join(path, filename))
 
 
-# TODO: Read this from the same source as the setup
-VERSION = '0.9.9'
-
-AWS_STORE = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com'
-
-ALL_DATA_DIR_URL = '%s/en_data_all-%s.tgz' % (AWS_STORE, VERSION)
-
-DEST_DIR = os.path.dirname(os.path.abspath(__file__))
-
-
-def download_file(url, download_path):
-    return uget.download(url, download_path, console=sys.stdout)
-
-
-def install_data(url, extract_path, download_path):
-    try:
-        os.makedirs(extract_path)
-    except FileExistsError:
-        pass
-
-    tmp = download_file(url, download_path)
-    assert tmp == download_path
-    t = tarfile.open(download_path)
-    t.extractall(extract_path)
-    os.unlink(download_path)
+def link(package, path):
+    """Replace whatever is at `path` with a symlink to the package's data."""
+    # lexists() also reports a dangling symlink, which exists() treats as
+    # missing; leaving it in place makes os.symlink() fail on the taken name.
+    if os.path.lexists(path):
+        os.unlink(path)
+    os.symlink(package.dir_path('data'), path)
 
 
 @plac.annotations(
     force=("Force overwrite", "flag", "f", bool),
 )
 def main(data_size='all', force=False):
+    """Install the 'en_default' package and link its data next to this file."""
-    filename = ALL_DATA_DIR_URL.rsplit('/', 1)[1]
-    download_path = os.path.join(DEST_DIR, filename)
-    data_path = os.path.join(DEST_DIR, 'data')
+    # TODO read version from the same source as the setup
+    sputnik = Sputnik('spacy', '0.99.0', console=sys.stdout)
 
-    if force and os.path.exists(download_path):
-        os.unlink(download_path)
+    path = os.path.dirname(os.path.abspath(__file__))
 
-    if force and os.path.exists(data_path):
-        shutil.rmtree(data_path)
+    command = sputnik.make_command(
+        data_path=os.path.abspath(os.path.join(path, '..', 'data')),
+        repository_url='https://index.spacy.io')
 
-    if os.path.exists(data_path):
-        print('data already installed at %s, overwrite with --force' % DEST_DIR)
-        sys.exit(1)
+    if force:
+        command.purge()
 
-    install_data(ALL_DATA_DIR_URL, DEST_DIR, download_path)
+    package = command.install('en_default')
+
+    # FIXME clean up old-style packages
+    migrate(path)
+
+    # FIXME supply spacy with an old-style data dir
+    link(package, os.path.join(path, 'data'))
 
 
 if __name__ == '__main__':